github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/store_hakeeper_check_test.go

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logservice
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"os"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/google/uuid"
    25  	"github.com/lni/dragonboat/v4"
    26  	"github.com/lni/goutils/leaktest"
    27  	"github.com/lni/vfs"
    28  	"github.com/stretchr/testify/assert"
    29  	"github.com/stretchr/testify/require"
    30  
    31  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    32  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    33  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    34  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    35  	"github.com/matrixorigin/matrixone/pkg/pb/task"
    36  	"github.com/matrixorigin/matrixone/pkg/taskservice"
    37  	"github.com/matrixorigin/matrixone/pkg/testutil"
    38  )
    39  
    40  func TestIDAllocatorDefaultState(t *testing.T) {
    41  	alloc := newIDAllocator()
    42  	assert.Equal(t, uint64(0), alloc.Capacity())
    43  	v, ok := alloc.Next()
    44  	assert.False(t, ok)
    45  	assert.Equal(t, uint64(0), v)
    46  }
    47  
    48  func TestIDAllocatorCapacity(t *testing.T) {
    49  	tests := []struct {
    50  		next     uint64
    51  		last     uint64
    52  		capacity uint64
    53  	}{
    54  		{1, 1, 1},
    55  		{2, 1, 0},
    56  		{1, 2, 2},
    57  		{100, 200, 101},
    58  	}
    59  
    60  	for _, tt := range tests {
    61  		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
    62  		assert.Equal(t, tt.capacity, alloc.Capacity())
    63  	}
    64  }
    65  
    66  func TestIDAllocatorSet(t *testing.T) {
    67  	alloc := idAllocator{nextID: 100, lastID: 200}
    68  	alloc.Set(hakeeper.K8SIDRangeEnd, hakeeper.K8SIDRangeEnd+100)
    69  	expected := idAllocator{
    70  		nextID: hakeeper.K8SIDRangeEnd,
    71  		lastID: hakeeper.K8SIDRangeEnd + 100,
    72  	}
    73  	assert.Equal(t, expected, alloc)
    74  }
    75  
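         // TestIDAllocatorRejectInvalidSetInput verifies that Set panics when the requested
         // ID range is invalid, e.g. when it falls below hakeeper.K8SIDRangeEnd.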
    76  func TestIDAllocatorRejectInvalidSetInput(t *testing.T) {
    77  	alloc := idAllocator{nextID: 100, lastID: 200}
    78  	defer func() {
    79  		if r := recover(); r == nil {
    80  			t.Fatalf("failed to trigger panic")
    81  		}
    82  	}()
    83  	alloc.Set(300, 400)
    84  }
    85  
    86  func TestIDAllocatorNext(t *testing.T) {
    87  	tests := []struct {
    88  		next     uint64
    89  		last     uint64
    90  		capacity uint64
    91  	}{
    92  		{1, 1, 1},
    93  		{2, 1, 0},
    94  		{1, 2, 2},
    95  		{100, 200, 101},
    96  	}
    97  
    98  	for _, tt := range tests {
    99  		expected := tt.next
   100  		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
   101  		for {
   102  			hasID := alloc.Capacity() != 0
   103  			v, ok := alloc.Next()
   104  			assert.Equal(t, hasID, ok)
   105  			if hasID {
   106  				assert.Equal(t, expected, v)
   107  				expected++
   108  			} else {
   109  				assert.Equal(t, uint64(0), v)
   110  				break
   111  			}
   112  		}
   113  	}
   114  }
   115  
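         // TestHandleBootstrapFailure verifies that handleBootstrapFailure panics.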
   116  func TestHandleBootstrapFailure(t *testing.T) {
   117  	defer func() {
   118  		if r := recover(); r == nil {
   119  			t.Fatalf("failed to trigger panic")
   120  		}
   121  	}()
   122  	s := store{}
   123  	s.handleBootstrapFailure()
   124  }
   125  
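         // runHAKeeperStoreTest starts a single-node test store, optionally together with a
         // log service replica, launches HAKeeper replica 1 on it, and then runs fn against it.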
   126  func runHAKeeperStoreTest(t *testing.T, startLogReplica bool, fn func(*testing.T, *store)) {
   127  	defer leaktest.AfterTest(t)()
   128  	cfg := getStoreTestConfig()
   129  	defer vfs.ReportLeakedFD(cfg.FS, t)
   130  	store, err := getTestStore(cfg, startLogReplica, nil)
   131  	assert.NoError(t, err)
   132  	defer func() {
   133  		assert.NoError(t, store.close())
   134  	}()
   135  	peers := make(map[uint64]dragonboat.Target)
   136  	peers[1] = store.id()
   137  	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
   138  	fn(t, store)
   139  }
   140  
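         // runHakeeperTaskServiceTest is like runHAKeeperStoreTest, but also wires an
         // in-memory task service into the store and passes it to fn.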
   141  func runHakeeperTaskServiceTest(t *testing.T, fn func(*testing.T, *store, taskservice.TaskService)) {
   142  	defer leaktest.AfterTest(t)()
   143  	cfg := getStoreTestConfig()
   144  	cfg.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   145  	defer vfs.ReportLeakedFD(cfg.FS, t)
   146  
   147  	taskService := taskservice.NewTaskService(runtime.DefaultRuntime(), taskservice.NewMemTaskStorage())
   148  	defer taskService.StopScheduleCronTask()
   149  
   150  	store, err := getTestStore(cfg, false, taskService)
   151  	assert.NoError(t, err)
   152  	defer func() {
   153  		assert.NoError(t, store.close())
   154  	}()
   155  	peers := make(map[uint64]dragonboat.Target)
   156  	peers[1] = store.id()
   157  	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
   158  	fn(t, store, taskService)
   159  }
   160  
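         // runHAKeeperClusterTest starts four log services backed by in-memory filesystems,
         // launches a 3-replica HAKeeper shard on the first three, and runs fn with all four
         // services; the fourth service acts as a spare store.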
   161  func runHAKeeperClusterTest(t *testing.T, fn func(*testing.T, []*Service)) {
   162  	defer leaktest.AfterTest(t)()
   163  	cfg1 := Config{
   164  		UUID:                uuid.New().String(),
   165  		FS:                  vfs.NewStrictMem(),
   166  		DeploymentID:        1,
   167  		RTTMillisecond:      5,
   168  		DataDir:             "data-1",
   169  		ServiceAddress:      "127.0.0.1:9002",
   170  		RaftAddress:         "127.0.0.1:9000",
   171  		GossipAddress:       "127.0.0.1:9001",
   172  		GossipSeedAddresses: []string{"127.0.0.1:9011", "127.0.0.1:9021", "127.0.0.1:9031"},
   173  		DisableWorkers:      true,
   174  	}
   175  	cfg1.HAKeeperConfig.TickPerSecond = 10
   176  	cfg1.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   177  	cfg1.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
   178  	cfg1.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   179  	cfg2 := Config{
   180  		UUID:                uuid.New().String(),
   181  		FS:                  vfs.NewStrictMem(),
   182  		DeploymentID:        1,
   183  		RTTMillisecond:      5,
   184  		DataDir:             "data-2",
   185  		ServiceAddress:      "127.0.0.1:9012",
   186  		RaftAddress:         "127.0.0.1:9010",
   187  		GossipAddress:       "127.0.0.1:9011",
   188  		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9021", "127.0.0.1:9031"},
   189  		DisableWorkers:      true,
   190  	}
   191  	cfg2.HAKeeperConfig.TickPerSecond = 10
   192  	cfg2.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   193  	cfg2.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
   194  	cfg2.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   195  	cfg3 := Config{
   196  		UUID:                uuid.New().String(),
   197  		FS:                  vfs.NewStrictMem(),
   198  		DeploymentID:        1,
   199  		RTTMillisecond:      5,
   200  		DataDir:             "data-3",
   201  		ServiceAddress:      "127.0.0.1:9022",
   202  		RaftAddress:         "127.0.0.1:9020",
   203  		GossipAddress:       "127.0.0.1:9021",
   204  		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9031"},
   205  		DisableWorkers:      true,
   206  	}
   207  	cfg3.HAKeeperConfig.TickPerSecond = 10
   208  	cfg3.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   209  	cfg3.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
   210  	cfg3.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   211  	cfg4 := Config{
   212  		UUID:                uuid.New().String(),
   213  		FS:                  vfs.NewStrictMem(),
   214  		DeploymentID:        1,
   215  		RTTMillisecond:      5,
   216  		DataDir:             "data-4",
   217  		ServiceAddress:      "127.0.0.1:9032",
   218  		RaftAddress:         "127.0.0.1:9030",
   219  		GossipAddress:       "127.0.0.1:9031",
   220  		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9021"},
   221  		DisableWorkers:      true,
   222  	}
   223  	cfg4.HAKeeperConfig.TickPerSecond = 10
   224  	cfg4.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   225  	cfg4.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
   226  	cfg4.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   227  	cfg1.Fill()
   228  	service1, err := NewService(cfg1,
   229  		testutil.NewFS(),
   230  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   231  			return true
   232  		}),
   233  	)
   234  	require.NoError(t, err)
   235  	defer func() {
   236  		assert.NoError(t, service1.Close())
   237  	}()
   238  	cfg2.Fill()
   239  	service2, err := NewService(cfg2,
   240  		testutil.NewFS(),
   241  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   242  			return true
   243  		}),
   244  	)
   245  	require.NoError(t, err)
   246  	defer func() {
   247  		assert.NoError(t, service2.Close())
   248  	}()
   249  	cfg3.Fill()
   250  	service3, err := NewService(cfg3,
   251  		testutil.NewFS(),
   252  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   253  			return true
   254  		}),
   255  	)
   256  	require.NoError(t, err)
   257  	defer func() {
   258  		assert.NoError(t, service3.Close())
   259  	}()
   260  	cfg4.Fill()
   261  	service4, err := NewService(cfg4,
   262  		testutil.NewFS(),
   263  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   264  			return true
   265  		}),
   266  	)
   267  	require.NoError(t, err)
   268  	defer func() {
   269  		assert.NoError(t, service4.Close())
   270  	}()
   271  
   272  	peers := make(map[uint64]dragonboat.Target)
   273  	peers[1] = service1.ID()
   274  	peers[2] = service2.ID()
   275  	peers[3] = service3.ID()
   276  	assert.NoError(t, service1.store.startHAKeeperReplica(1, peers, false))
   277  	assert.NoError(t, service2.store.startHAKeeperReplica(2, peers, false))
   278  	assert.NoError(t, service3.store.startHAKeeperReplica(3, peers, false))
   279  	fn(t, []*Service{service1, service2, service3, service4})
   280  }
   281  
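         // TestHAKeeperCanBootstrapAndRepairShards bootstraps a cluster with 1 DN shard and
         // 1 Log shard on three log services, then stops one of them and verifies that
         // HAKeeper repairs the Log and HAKeeper shards and re-schedules the DN replica.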
   282  func TestHAKeeperCanBootstrapAndRepairShards(t *testing.T) {
   283  	fn := func(t *testing.T, services []*Service) {
    284  		// bootstrap the cluster with 1 DN shard and 1 Log shard; the Log and
    285  		// HAKeeper shards each have 3 replicas
   286  		store1 := services[0].store
   287  		state, err := store1.getCheckerState()
   288  		require.NoError(t, err)
   289  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   290  		require.NoError(t, store1.setInitialClusterInfo(1, 1, 3))
   291  		state, err = store1.getCheckerState()
   292  		require.NoError(t, err)
   293  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   294  
   295  		sendHeartbeat := func(ss []*Service) {
   296  			for _, s := range ss {
   297  				done := false
   298  				for i := 0; i < 10; i++ {
   299  					m := s.store.getHeartbeatMessage()
   300  					ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   301  					defer cancel()
   302  					_, err := s.store.addLogStoreHeartbeat(ctx, m)
   303  					if err == dragonboat.ErrTimeout {
   304  						time.Sleep(100 * time.Millisecond)
   305  					} else {
   306  						if err == nil {
   307  							done = true
   308  							break
   309  						} else {
   310  							t.Fatalf("failed to add heartbeat %v", err)
   311  						}
   312  					}
   313  				}
   314  				if !done {
   315  					t.Fatalf("failed to add heartbeat after 10 retries")
   316  				}
   317  			}
   318  		}
   319  		sendHeartbeat(services[:3])
   320  
   321  		// fake a DN store
   322  		dnMsg := pb.DNStoreHeartbeat{
   323  			UUID:   uuid.New().String(),
   324  			Shards: make([]pb.DNShardInfo, 0),
   325  		}
   326  		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
   327  		defer cancel()
   328  		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg)
   329  		require.NoError(t, err)
   330  
    331  		// find the leader HAKeeper store, as we need its term value
   332  		var term uint64
   333  		var leaderStore *store
   334  		for _, s := range services[:3] {
   335  			isLeader, curTerm, err := s.store.isLeaderHAKeeper()
   336  			require.NoError(t, err)
   337  			if isLeader {
   338  				term = curTerm
   339  				leaderStore = s.store
   340  				break
   341  			}
   342  		}
   343  		require.NotNil(t, leaderStore)
   344  		require.True(t, term > 0)
   345  
   346  		// bootstrap the cluster
   347  		state, err = leaderStore.getCheckerState()
   348  		require.NoError(t, err)
   349  		leaderStore.bootstrap(term, state)
   350  
   351  		state, err = leaderStore.getCheckerState()
   352  		require.NoError(t, err)
   353  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   354  		assert.Equal(t, uint64(checkBootstrapCycles), leaderStore.bootstrapCheckCycles)
   355  		require.NotNil(t, leaderStore.bootstrapMgr)
   356  		assert.False(t, leaderStore.bootstrapMgr.CheckBootstrap(state.LogState))
   357  
   358  		// get and apply all bootstrap schedule commands
   359  		for _, s := range services[:3] {
   360  			cb, err := s.store.getCommandBatch(ctx, s.store.id())
   361  			require.NoError(t, err)
   362  			if len(cb.Commands) > 0 {
   363  				s.handleStartReplica(cb.Commands[0])
   364  			}
   365  		}
   366  
   367  		// check bootstrap can be completed
   368  		for i := 0; i < 100; i++ {
   369  			sendHeartbeat(services[:3])
   370  			state, err = leaderStore.getCheckerState()
   371  			require.NoError(t, err)
   372  			leaderStore.checkBootstrap(state)
   373  
   374  			state, err = leaderStore.getCheckerState()
   375  			require.NoError(t, err)
   376  			if state.State != pb.HAKeeperRunning {
   377  				// FIXME: why wait here?
   378  				time.Sleep(50 * time.Millisecond)
   379  			} else {
   380  				break
   381  			}
   382  			if i == 99 {
   383  				t.Fatalf("failed to complete bootstrap")
   384  			}
   385  		}
   386  
    387  		// get the DN bootstrap command; it contains the DN shard ID and replica ID
   388  		cb, err := leaderStore.getCommandBatch(ctx, dnMsg.UUID)
   389  		require.NoError(t, err)
   390  		require.Equal(t, 1, len(cb.Commands))
   391  		cmd := cb.Commands[0]
   392  		assert.True(t, cmd.Bootstrapping)
   393  		assert.Equal(t, pb.DNService, cmd.ServiceType)
   394  		dnShardInfo := pb.DNShardInfo{
   395  			ShardID:   cmd.ConfigChange.Replica.ShardID,
   396  			ReplicaID: cmd.ConfigChange.Replica.ReplicaID,
   397  		}
   398  		dnMsg.Shards = append(dnMsg.Shards, dnShardInfo)
    399  		// report the assigned shard back as if the DN replica is running
   400  		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg)
   401  		require.NoError(t, err)
   402  		// fake a free DN store
   403  		dnMsg2 := pb.DNStoreHeartbeat{
   404  			UUID:   uuid.New().String(),
   405  			Shards: make([]pb.DNShardInfo, 0),
   406  		}
   407  		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg2)
   408  		require.NoError(t, err)
   409  
   410  		// stop store 1
   411  		require.NoError(t, services[0].Close())
    412  		// note that service.Close() cannot be called repeatedly
   413  		services[0].store = nil
   414  		services = services[1:]
   415  
   416  		// wait for HAKeeper to repair the Log & HAKeeper shards
   417  		dnRepaired := false
   418  		for i := 0; i < 5000; i++ {
   419  			testLogger.Info(fmt.Sprintf("iteration %d", i))
   420  			tn := func() (bool, error) {
   421  				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   422  				defer cancel()
   423  				m := services[0].store.getHeartbeatMessage()
   424  				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
   425  					return false, err
   426  				} else {
   427  					services[0].handleCommands(cb.Commands)
   428  				}
   429  				m = services[1].store.getHeartbeatMessage()
   430  				if cb, err := services[1].store.addLogStoreHeartbeat(ctx, m); err != nil {
   431  					return false, err
   432  				} else {
   433  					services[1].handleCommands(cb.Commands)
   434  				}
   435  				m = services[2].store.getHeartbeatMessage()
   436  				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
   437  					return false, err
   438  				} else {
   439  					services[2].handleCommands(cb.Commands)
   440  				}
   441  				if _, err := services[0].store.addDNStoreHeartbeat(ctx, dnMsg2); err != nil {
   442  					return false, err
   443  				}
   444  
   445  				for _, s := range services {
   446  					if hasShard(s.store, 0) {
   447  						s.store.hakeeperTick()
   448  						s.store.hakeeperCheck()
   449  					}
   450  
   451  					cb, err = services[0].store.getCommandBatch(ctx, dnMsg2.UUID)
   452  					if err != nil {
   453  						return false, err
   454  					}
   455  					if len(cb.Commands) > 0 {
   456  						cmd := cb.Commands[0]
   457  						if cmd.ServiceType == pb.DNService {
   458  							if cmd.ConfigChange != nil && cmd.ConfigChange.Replica.ShardID == dnShardInfo.ShardID &&
   459  								cmd.ConfigChange.Replica.ReplicaID > dnShardInfo.ReplicaID {
   460  								dnRepaired = true
   461  							}
   462  						}
   463  					}
   464  				}
   465  
   466  				logRepaired := true
   467  				for _, s := range services {
   468  					if !hasShard(s.store, 0) || !hasShard(s.store, 1) {
   469  						logRepaired = false
   470  						break
   471  					}
   472  				}
   473  				testLogger.Info(fmt.Sprintf("dnRepaired %t, logRepaired %t", dnRepaired, logRepaired))
   474  				if !logRepaired || !dnRepaired {
   475  					return false, nil
   476  				} else {
   477  					testLogger.Info(fmt.Sprintf("repair completed, i: %d", i))
   478  					return true, nil
   479  				}
   480  			}
   481  			completed, err := tn()
   482  			if err != nil && err != dragonboat.ErrTimeout &&
   483  				err != dragonboat.ErrInvalidDeadline && err != dragonboat.ErrTimeoutTooSmall {
   484  				t.Fatalf("unexpected error %v", err)
   485  			}
   486  			if completed {
   487  				for _, s := range services[:3] {
   488  					_ = s.task.holder.Close()
   489  				}
   490  				return
   491  			}
   492  			time.Sleep(5 * time.Millisecond)
   493  		}
   494  		t.Fatalf("failed to repair shards")
   495  	}
   496  	runHAKeeperClusterTest(t, fn)
   497  }
   498  
   499  func TestGetCheckerState(t *testing.T) {
   500  	fn := func(t *testing.T, store *store) {
   501  		state, err := store.getCheckerState()
   502  		require.NoError(t, err)
   503  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   504  	}
   505  	runHAKeeperStoreTest(t, false, fn)
   506  }
   507  
   508  func TestSetInitialClusterInfo(t *testing.T) {
   509  	fn := func(t *testing.T, store *store) {
   510  		state, err := store.getCheckerState()
   511  		require.NoError(t, err)
   512  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   513  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
   514  		state, err = store.getCheckerState()
   515  		require.NoError(t, err)
   516  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   517  	}
   518  	runHAKeeperStoreTest(t, false, fn)
   519  }
   520  
   521  func TestFailedBootstrap(t *testing.T) {
   522  	testBootstrap(t, true)
   523  }
   524  
   525  func TestBootstrap(t *testing.T) {
   526  	testBootstrap(t, false)
   527  }
   528  
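         // testBootstrap drives a single-replica cluster through bootstrap. When fail is true,
         // the bootstrap schedule commands are never applied, so the state machine eventually
         // reaches HAKeeperBootstrapFailed; otherwise the commands are applied and the test
         // waits for the HAKeeper to reach the running state.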
   529  func testBootstrap(t *testing.T, fail bool) {
   530  	fn := func(t *testing.T, store *store) {
   531  		state, err := store.getCheckerState()
   532  		require.NoError(t, err)
   533  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   534  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
   535  		state, err = store.getCheckerState()
   536  		require.NoError(t, err)
   537  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   538  		m := store.getHeartbeatMessage()
   539  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   540  		defer cancel()
   541  		_, err = store.addLogStoreHeartbeat(ctx, m)
   542  		assert.NoError(t, err)
   543  
   544  		dnMsg := pb.DNStoreHeartbeat{
   545  			UUID:   uuid.New().String(),
   546  			Shards: make([]pb.DNShardInfo, 0),
   547  		}
   548  		_, err = store.addDNStoreHeartbeat(ctx, dnMsg)
   549  		assert.NoError(t, err)
   550  
   551  		_, term, err := store.isLeaderHAKeeper()
   552  		require.NoError(t, err)
   553  
   554  		state, err = store.getCheckerState()
   555  		require.NoError(t, err)
   556  		store.bootstrap(term, state)
   557  
   558  		state, err = store.getCheckerState()
   559  		require.NoError(t, err)
   560  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   561  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   562  		require.NotNil(t, store.bootstrapMgr)
   563  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   564  
   565  		if fail {
    566  			// keep checking; bootstrap will eventually be marked as failed
   567  			for i := 0; i <= checkBootstrapCycles; i++ {
   568  				store.checkBootstrap(state)
   569  			}
   570  
   571  			state, err = store.getCheckerState()
   572  			require.NoError(t, err)
   573  			assert.Equal(t, pb.HAKeeperBootstrapFailed, state.State)
   574  		} else {
   575  			cb, err := store.getCommandBatch(ctx, dnMsg.UUID)
   576  			require.NoError(t, err)
   577  			require.Equal(t, 1, len(cb.Commands))
   578  			assert.True(t, cb.Commands[0].Bootstrapping)
   579  			assert.Equal(t, pb.DNService, cb.Commands[0].ServiceType)
   580  			assert.True(t, cb.Commands[0].ConfigChange.Replica.ReplicaID > 0)
   581  
   582  			cb, err = store.getCommandBatch(ctx, store.id())
   583  			require.NoError(t, err)
   584  			require.Equal(t, 1, len(cb.Commands))
   585  			assert.True(t, cb.Commands[0].Bootstrapping)
   586  			service := &Service{store: store}
   587  			service.handleStartReplica(cb.Commands[0])
   588  
   589  			for i := 0; i < 100; i++ {
   590  				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   591  				defer cancel()
   592  				m := store.getHeartbeatMessage()
   593  				_, err = store.addLogStoreHeartbeat(ctx, m)
   594  				assert.NoError(t, err)
   595  
   596  				state, err = store.getCheckerState()
   597  				require.NoError(t, err)
   598  				store.checkBootstrap(state)
   599  
   600  				state, err = store.getCheckerState()
   601  				require.NoError(t, err)
   602  				if state.State != pb.HAKeeperRunning {
   603  					time.Sleep(50 * time.Millisecond)
   604  				} else {
   605  					return
   606  				}
    607  				if i == 99 {
   608  					t.Fatalf("failed to complete bootstrap")
   609  				}
   610  			}
   611  		}
   612  	}
   613  	runHAKeeperStoreTest(t, false, fn)
   614  }
   615  
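         // TestTaskSchedulerCanScheduleTasksToCNs bootstraps a single-replica cluster,
         // registers CN stores via heartbeats, creates tasks, and verifies that taskSchedule
         // assigns each task to a registered CN store.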
   616  func TestTaskSchedulerCanScheduleTasksToCNs(t *testing.T) {
   617  	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
   618  		state, err := store.getCheckerState()
   619  		require.NoError(t, err)
   620  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   621  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
   622  		state, err = store.getCheckerState()
   623  		require.NoError(t, err)
   624  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   625  		m := store.getHeartbeatMessage()
   626  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   627  		defer cancel()
   628  		_, err = store.addLogStoreHeartbeat(ctx, m)
   629  		assert.NoError(t, err)
   630  
   631  		_, term, err := store.isLeaderHAKeeper()
   632  		require.NoError(t, err)
   633  
   634  		state, err = store.getCheckerState()
   635  		require.NoError(t, err)
   636  		store.bootstrap(term, state)
   637  
   638  		state, err = store.getCheckerState()
   639  		require.NoError(t, err)
   640  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   641  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   642  		require.NotNil(t, store.bootstrapMgr)
   643  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   644  
   645  		cb, err := store.getCommandBatch(ctx, store.id())
   646  		require.NoError(t, err)
   647  		require.Equal(t, 1, len(cb.Commands))
   648  		assert.True(t, cb.Commands[0].Bootstrapping)
   649  		service := &Service{store: store}
   650  		service.handleStartReplica(cb.Commands[0])
   651  
   652  		for i := 0; i < 100; i++ {
   653  			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   654  			defer cancel()
   655  			m := store.getHeartbeatMessage()
   656  			_, err = store.addLogStoreHeartbeat(ctx, m)
   657  			assert.NoError(t, err)
   658  
   659  			state, err = store.getCheckerState()
   660  			require.NoError(t, err)
   661  			store.checkBootstrap(state)
   662  
   663  			state, err = store.getCheckerState()
   664  			require.NoError(t, err)
   665  			if state.State != pb.HAKeeperRunning {
   666  				time.Sleep(50 * time.Millisecond)
   667  			} else {
   668  				break
   669  			}
    670  			if i == 99 {
   671  				t.Fatalf("failed to complete bootstrap")
   672  			}
   673  		}
   674  
   675  		cnUUID1 := uuid.New().String()
   676  		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
   677  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
   678  		assert.NoError(t, err)
   679  		err = taskService.Create(ctx, task.TaskMetadata{ID: "a"})
   680  		assert.NoError(t, err)
   681  		state, err = store.getCheckerState()
   682  		require.NoError(t, err)
   683  		tasks, err := taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   684  		assert.NoError(t, err)
   685  		assert.Equal(t, 0, len(tasks))
   686  		store.taskSchedule(state)
   687  		// update state
   688  		state, err = store.getCheckerState()
   689  		require.NoError(t, err)
   690  		store.taskSchedule(state)
   691  		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   692  		assert.NoError(t, err)
   693  		assert.Equal(t, 1, len(tasks))
   694  
   695  		cnUUID2 := uuid.New().String()
   696  		cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
   697  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
   698  		assert.NoError(t, err)
   699  		err = taskService.Create(ctx, task.TaskMetadata{ID: "b"})
   700  		assert.NoError(t, err)
   701  		state, err = store.getCheckerState()
   702  		require.NoError(t, err)
   703  		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   704  		assert.NoError(t, err)
   705  		assert.Equal(t, 0, len(tasks))
   706  		store.taskSchedule(state)
   707  		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   708  		assert.NoError(t, err)
   709  		assert.Equal(t, 1, len(tasks))
   710  	}
   711  	runHakeeperTaskServiceTest(t, fn)
   712  }
   713  
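         // TestTaskSchedulerCanReScheduleExpiredTasks schedules a task to a CN store and then
         // stops heartbeating it; once that store expires, the task should be re-assigned to a
         // CN store that is still sending heartbeats.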
   714  func TestTaskSchedulerCanReScheduleExpiredTasks(t *testing.T) {
   715  	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
   716  		state, err := store.getCheckerState()
   717  		require.NoError(t, err)
   718  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   719  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
   720  		state, err = store.getCheckerState()
   721  		require.NoError(t, err)
   722  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   723  		m := store.getHeartbeatMessage()
   724  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   725  		defer cancel()
   726  		_, err = store.addLogStoreHeartbeat(ctx, m)
   727  		assert.NoError(t, err)
   728  
   729  		_, term, err := store.isLeaderHAKeeper()
   730  		require.NoError(t, err)
   731  
   732  		state, err = store.getCheckerState()
   733  		require.NoError(t, err)
   734  		store.bootstrap(term, state)
   735  
   736  		state, err = store.getCheckerState()
   737  		require.NoError(t, err)
   738  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   739  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   740  		require.NotNil(t, store.bootstrapMgr)
   741  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   742  
   743  		cb, err := store.getCommandBatch(ctx, store.id())
   744  		require.NoError(t, err)
   745  		require.Equal(t, 1, len(cb.Commands))
   746  		assert.True(t, cb.Commands[0].Bootstrapping)
   747  		service := &Service{store: store}
   748  		service.handleStartReplica(cb.Commands[0])
   749  
   750  		for i := 0; i < 100; i++ {
   751  			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   752  			defer cancel()
   753  			m := store.getHeartbeatMessage()
   754  			_, err = store.addLogStoreHeartbeat(ctx, m)
   755  			assert.NoError(t, err)
   756  
   757  			state, err = store.getCheckerState()
   758  			require.NoError(t, err)
   759  			store.checkBootstrap(state)
   760  
   761  			state, err = store.getCheckerState()
   762  			require.NoError(t, err)
   763  			if state.State != pb.HAKeeperRunning {
   764  				time.Sleep(50 * time.Millisecond)
   765  			} else {
   766  				break
   767  			}
    768  			if i == 99 {
   769  				t.Fatalf("failed to complete bootstrap")
   770  			}
   771  		}
   772  
   773  		cnUUID1 := uuid.New().String()
   774  		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
   775  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
   776  		assert.NoError(t, err)
   777  		err = taskService.Create(ctx, task.TaskMetadata{ID: "a"})
   778  		assert.NoError(t, err)
   779  		state, err = store.getCheckerState()
   780  		require.NoError(t, err)
   781  		tasks, err := taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   782  		assert.NoError(t, err)
   783  		assert.Equal(t, 0, len(tasks))
   784  		store.taskSchedule(state)
   785  		// update state
   786  		state, err = store.getCheckerState()
   787  		require.NoError(t, err)
   788  		store.taskSchedule(state)
   789  		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   790  		assert.NoError(t, err)
   791  		assert.Equal(t, 1, len(tasks))
   792  
   793  		cnUUID2 := uuid.New().String()
   794  		for i := 0; i < 1000; i++ {
   795  			testLogger.Info(fmt.Sprintf("iteration %d", i))
   796  			tn := func() bool {
   797  				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   798  				defer cancel()
   799  				cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
   800  				_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
   801  				assert.NoError(t, err)
   802  				state, err = store.getCheckerState()
   803  				require.NoError(t, err)
   804  				store.taskSchedule(state)
   805  				tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   806  				assert.NoError(t, err)
   807  				if len(tasks) == 0 {
   808  					testLogger.Info("no task found")
   809  					time.Sleep(50 * time.Millisecond)
   810  				} else {
   811  					tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   812  					assert.Equal(t, 0, len(tasks))
   813  					return true
   814  				}
   815  				return false
   816  			}
   817  			completed := tn()
   818  			if completed {
   819  				store.taskScheduler.StopScheduleCronTask()
   820  				return
   821  			}
   822  			time.Sleep(100 * time.Millisecond)
   823  		}
   824  		t.Fatalf("failed to reschedule expired tasks")
   825  	}
   826  	runHakeeperTaskServiceTest(t, fn)
   827  }
   828  
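         // TestGetTaskTableUserFromEnv verifies that getTaskTableUserFromEnv only returns a
         // task table user when both moAdminUser and moAdminPassword are set to non-empty
         // values.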
   829  func TestGetTaskTableUserFromEnv(t *testing.T) {
   830  	os.Setenv(moAdminUser, "root")
   831  	user, ok := getTaskTableUserFromEnv()
   832  	require.False(t, ok)
   833  	require.Equal(t, pb.TaskTableUser{}, user)
   834  
   835  	os.Setenv(moAdminPassword, "")
   836  	user, ok = getTaskTableUserFromEnv()
   837  	require.False(t, ok)
   838  	require.Equal(t, pb.TaskTableUser{}, user)
   839  
   840  	os.Setenv(moAdminPassword, "root")
   841  	user, ok = getTaskTableUserFromEnv()
   842  	require.True(t, ok)
   843  	require.Equal(t, pb.TaskTableUser{Username: "root", Password: "root"}, user)
   844  }