github.com/matrixorigin/matrixone@v1.2.0/pkg/logservice/store_hakeeper_check_test.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logservice

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/lni/dragonboat/v4"
	"github.com/lni/goutils/leaktest"
	"github.com/lni/vfs"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/hakeeper"
	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/pb/task"
	"github.com/matrixorigin/matrixone/pkg/taskservice"
)

func TestIDAllocatorDefaultState(t *testing.T) {
	alloc := newIDAllocator()
	assert.Equal(t, uint64(0), alloc.Capacity())
	v, ok := alloc.Next()
	assert.False(t, ok)
	assert.Equal(t, uint64(0), v)
}

func TestIDAllocatorCapacity(t *testing.T) {
	tests := []struct {
		next     uint64
		last     uint64
		capacity uint64
	}{
		{1, 1, 1},
		{2, 1, 0},
		{1, 2, 2},
		{100, 200, 101},
	}

	for _, tt := range tests {
		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
		assert.Equal(t, tt.capacity, alloc.Capacity())
	}
}

func TestIDAllocatorSet(t *testing.T) {
	alloc := idAllocator{nextID: 100, lastID: 200}
	alloc.Set(hakeeper.K8SIDRangeEnd, hakeeper.K8SIDRangeEnd+100)
	expected := idAllocator{
		nextID: hakeeper.K8SIDRangeEnd,
		lastID: hakeeper.K8SIDRangeEnd + 100,
	}
	assert.Equal(t, expected, alloc)
}

func TestIDAllocatorRejectInvalidSetInput(t *testing.T) {
	alloc := idAllocator{nextID: 100, lastID: 200}
	defer func() {
		if r := recover(); r == nil {
			t.Fatalf("failed to trigger panic")
		}
	}()
	alloc.Set(300, 400)
}

func TestIDAllocatorNext(t *testing.T) {
	tests := []struct {
		next     uint64
		last     uint64
		capacity uint64
	}{
		{1, 1, 1},
		{2, 1, 0},
		{1, 2, 2},
		{100, 200, 101},
	}

	for _, tt := range tests {
		expected := tt.next
		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
		for {
			hasID := alloc.Capacity() != 0
			v, ok := alloc.Next()
			assert.Equal(t, hasID, ok)
			if hasID {
				assert.Equal(t, expected, v)
				expected++
			} else {
				assert.Equal(t, uint64(0), v)
				break
			}
		}
	}
}

func TestHandleBootstrapFailure(t *testing.T) {
	defer func() {
		if r := recover(); r == nil {
			t.Fatalf("failed to trigger panic")
		}
	}()
	s := store{}
	s.handleBootstrapFailure()
}

func runHAKeeperStoreTest(t *testing.T, startLogReplica bool, fn func(*testing.T, *store)) {
	defer leaktest.AfterTest(t)()
	cfg := getStoreTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	store, err := getTestStore(cfg, startLogReplica, nil)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, store.close())
	}()
	peers := make(map[uint64]dragonboat.Target)
	peers[1] = store.id()
	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
	fn(t, store)
}

func runHakeeperTaskServiceTest(t *testing.T, fn func(*testing.T, *store, taskservice.TaskService)) {
	defer leaktest.AfterTest(t)()
	cfg := getStoreTestConfig()
	cfg.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	defer vfs.ReportLeakedFD(cfg.FS, t)

	taskService := taskservice.NewTaskService(runtime.DefaultRuntime(), taskservice.NewMemTaskStorage())
	defer taskService.StopScheduleCronTask()

	store, err := getTestStore(cfg, false, taskService)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, store.close())
	}()
	peers := make(map[uint64]dragonboat.Target)
	peers[1] = store.id()
	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
	fn(t, store, taskService)
}

func runHAKeeperClusterTest(t *testing.T, fn func(*testing.T, []*Service)) {
	defer leaktest.AfterTest(t)()
	cfg1 := DefaultConfig()
	cfg1.UUID = uuid.New().String()
	cfg1.FS = vfs.NewStrictMem()
	cfg1.DeploymentID = 1
	cfg1.RTTMillisecond = 5
	cfg1.DataDir = "data-1"
	cfg1.LogServicePort = 9002
	cfg1.RaftPort = 9000
	cfg1.GossipPort = 9001
	cfg1.GossipSeedAddresses = []string{"127.0.0.1:9011", "127.0.0.1:9021", "127.0.0.1:9031"}
	cfg1.DisableWorkers = true
	cfg1.HAKeeperConfig.TickPerSecond = 10
	cfg1.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg1.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
	cfg1.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg2 := DefaultConfig()
	cfg2.UUID = uuid.New().String()
	cfg2.FS = vfs.NewStrictMem()
	cfg2.DeploymentID = 1
	cfg2.RTTMillisecond = 5
	cfg2.DataDir = "data-2"
	cfg2.LogServicePort = 9012
	cfg2.RaftPort = 9010
	cfg2.GossipPort = 9011
	cfg2.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9021", "127.0.0.1:9031"}
	cfg2.DisableWorkers = true
	cfg2.HAKeeperConfig.TickPerSecond = 10
	cfg2.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg2.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
	cfg2.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg3 := DefaultConfig()
	cfg3.UUID = uuid.New().String()
	cfg3.FS = vfs.NewStrictMem()
	cfg3.DeploymentID = 1
	cfg3.RTTMillisecond = 5
	cfg3.DataDir = "data-3"
	cfg3.LogServicePort = 9022
	cfg3.RaftPort = 9020
	cfg3.GossipPort = 9021
	cfg3.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9031"}
	cfg3.DisableWorkers = true
	cfg3.HAKeeperConfig.TickPerSecond = 10
	cfg3.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg3.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
	cfg3.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg4 := DefaultConfig()
	cfg4.UUID = uuid.New().String()
	cfg4.FS = vfs.NewStrictMem()
	cfg4.DeploymentID = 1
	cfg4.RTTMillisecond = 5
	cfg4.DataDir = "data-4"
	cfg4.LogServicePort = 9032
	cfg4.RaftPort = 9030
	cfg4.GossipPort = 9031
	cfg4.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9021"}
	cfg4.DisableWorkers = true
	cfg4.HAKeeperConfig.TickPerSecond = 10
	cfg4.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg4.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
	cfg4.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	service1, err := NewService(cfg1,
		newFS(),
		nil,
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service1.Close())
	}()
	service2, err := NewService(cfg2,
		newFS(),
		nil,
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service2.Close())
	}()
	service3, err := NewService(cfg3,
		newFS(),
		nil,
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service3.Close())
	}()
	service4, err := NewService(cfg4,
		newFS(),
		nil,
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service4.Close())
	}()

	peers := make(map[uint64]dragonboat.Target)
	peers[1] = service1.ID()
	peers[2] = service2.ID()
	peers[3] = service3.ID()
	assert.NoError(t, service1.store.startHAKeeperReplica(1, peers, false))
	assert.NoError(t, service2.store.startHAKeeperReplica(2, peers, false))
	assert.NoError(t, service3.store.startHAKeeperReplica(3, peers, false))
	fn(t, []*Service{service1, service2, service3, service4})
}

func TestHAKeeperCanBootstrapAndRepairShards(t *testing.T) {
	fn := func(t *testing.T, services []*Service) {
		// bootstrap the cluster, 1 TN 1 Log shard, Log and HAKeeper have
		// 3 replicas
		hakeeperDefaultTimeout = 10 * time.Second

		store1 := services[0].store
		state, err := store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store1.setInitialClusterInfo(1, 1, 3, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)

		sendHeartbeat := func(ss []*Service) {
			for _, s := range ss {
				done := false
				for i := 0; i < 10; i++ {
					m := s.store.getHeartbeatMessage()
					ctx, cancel := context.WithTimeout(context.Background(), time.Second)
					defer cancel()
					_, err := s.store.addLogStoreHeartbeat(ctx, m)
					if err == dragonboat.ErrTimeout {
						time.Sleep(100 * time.Millisecond)
					} else {
						if err == nil {
							done = true
							break
						} else {
							t.Fatalf("failed to add heartbeat %v", err)
						}
					}
				}
				if !done {
					t.Fatalf("failed to add heartbeat after 10 retries")
				}
			}
		}
		sendHeartbeat(services[:3])

		// fake a TN store
		tnMsg := pb.TNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.TNShardInfo, 0),
		}
		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
		defer cancel()
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg)
		require.NoError(t, err)

		// find out the leader HAKeeper store as we need the term value
		var term uint64
		var leaderStore *store
		for _, s := range services[:3] {
			isLeader, curTerm, err := s.store.isLeaderHAKeeper()
			require.NoError(t, err)
			if isLeader {
				term = curTerm
				leaderStore = s.store
				break
			}
		}
		require.NotNil(t, leaderStore)
		require.True(t, term > 0)

		// bootstrap the cluster
		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		leaderStore.bootstrap(term, state)

		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), leaderStore.bootstrapCheckCycles)
		require.NotNil(t, leaderStore.bootstrapMgr)
		assert.False(t, leaderStore.bootstrapMgr.CheckBootstrap(state.LogState))

		// get and apply all bootstrap schedule commands
		for _, s := range services[:3] {
			cb, err := s.store.getCommandBatch(ctx, s.store.id())
			require.NoError(t, err)
			if len(cb.Commands) > 0 {
				s.handleStartReplica(cb.Commands[0])
			}
		}

		// check bootstrap can be completed
		for i := 0; i < 100; i++ {
			sendHeartbeat(services[:3])
			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			leaderStore.checkBootstrap(state)

			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				// FIXME: why wait here?
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		// get the TN bootstrap command, it contains TN shard and replica ID
		cb, err := leaderStore.getCommandBatch(ctx, tnMsg.UUID)
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		cmd := cb.Commands[0]
		assert.True(t, cmd.Bootstrapping)
		assert.Equal(t, pb.TNService, cmd.ServiceType)
		tnShardInfo := pb.TNShardInfo{
			ShardID:   cmd.ConfigChange.Replica.ShardID,
			ReplicaID: cmd.ConfigChange.Replica.ReplicaID,
		}
		tnMsg.Shards = append(tnMsg.Shards, tnShardInfo)
		// as if TN is running
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg)
		require.NoError(t, err)
		// fake a free TN store
		tnMsg2 := pb.TNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.TNShardInfo, 0),
		}
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg2)
		require.NoError(t, err)

		// stop store 1
		require.NoError(t, services[0].Close())
		// note that service.Close cannot be called repeatedly
		services[0].store = nil
		services = services[1:]

		// wait for HAKeeper to repair the Log & HAKeeper shards
		tnRepaired := false
		for i := 0; i < 5000; i++ {
			testLogger.Debug(fmt.Sprintf("iteration %d", i))
			tn := func() (bool, error) {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				m := services[0].store.getHeartbeatMessage()
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[0].handleCommands(cb.Commands)
				}
				m = services[1].store.getHeartbeatMessage()
				if cb, err := services[1].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[1].handleCommands(cb.Commands)
				}
				m = services[2].store.getHeartbeatMessage()
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[2].handleCommands(cb.Commands)
				}
				if _, err := services[0].store.addTNStoreHeartbeat(ctx, tnMsg2); err != nil {
					return false, err
				}

				for _, s := range services {
					if hasShard(s.store, 0) {
						s.store.hakeeperTick()
						s.store.hakeeperCheck()
					}

					cb, err = services[0].store.getCommandBatch(ctx, tnMsg2.UUID)
					if err != nil {
						return false, err
					}
					if len(cb.Commands) > 0 {
						cmd := cb.Commands[0]
						if cmd.ServiceType == pb.TNService {
							if cmd.ConfigChange != nil && cmd.ConfigChange.Replica.ShardID == tnShardInfo.ShardID &&
								cmd.ConfigChange.Replica.ReplicaID > tnShardInfo.ReplicaID {
								tnRepaired = true
							}
						}
					}
				}

				logRepaired := true
				for _, s := range services {
					if !hasShard(s.store, 0) || !hasShard(s.store, 1) {
						logRepaired = false
						break
					}
				}
				testLogger.Debug(fmt.Sprintf("tnRepaired %t, logRepaired %t", tnRepaired, logRepaired))
				if !logRepaired || !tnRepaired {
					return false, nil
				} else {
					testLogger.Debug(fmt.Sprintf("repair completed, i: %d", i))
					return true, nil
				}
			}
			completed, err := tn()
			if err != nil && err != dragonboat.ErrTimeout &&
				err != dragonboat.ErrInvalidDeadline && err != dragonboat.ErrTimeoutTooSmall {
				t.Fatalf("unexpected error %v", err)
			}
			if completed {
				for _, s := range services[:3] {
					_ = s.task.holder.Close()
				}
				return
			}
			time.Sleep(5 * time.Millisecond)
		}
		t.Fatalf("failed to repair shards")
	}
	runHAKeeperClusterTest(t, fn)
}

func TestGetCheckerStateFromLeader(t *testing.T) {
	fn := func(t *testing.T, store *store) {
		ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*10))
		defer cancel()

		for {
			select {
			case <-ctx.Done():
				t.Error("test deadline reached")
				return

			default:
				isLeader, termA, err := store.isLeaderHAKeeper()
				state, termB := store.getCheckerStateFromLeader()
				require.NoError(t, err)
				assert.Equal(t, termB, termA)

				if !isLeader {
					assert.Equal(t, (*pb.CheckerState)(nil), state)
				} else {
					assert.NotEqual(t, (*pb.CheckerState)(nil), state)
					return
				}
				time.Sleep(time.Second)
			}
		}
	}

	runHAKeeperStoreTest(t, false, fn)
}

func TestGetCheckerState(t *testing.T) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestSetInitialClusterInfo(t *testing.T) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestFailedBootstrap(t *testing.T) {
	testBootstrap(t, true)
}

func TestBootstrap(t *testing.T) {
	testBootstrap(t, false)
}

func testBootstrap(t *testing.T, fail bool) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		tnMsg := pb.TNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.TNShardInfo, 0),
		}
		_, err = store.addTNStoreHeartbeat(ctx, tnMsg)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		if fail {
			// keep checking, bootstrap will eventually be set as failed
			for i := 0; i <= checkBootstrapCycles; i++ {
				store.checkBootstrap(state)
			}

			state, err = store.getCheckerState()
			require.NoError(t, err)
			assert.Equal(t, pb.HAKeeperBootstrapFailed, state.State)
		} else {
			cb, err := store.getCommandBatch(ctx, tnMsg.UUID)
			require.NoError(t, err)
			require.Equal(t, 1, len(cb.Commands))
			assert.True(t, cb.Commands[0].Bootstrapping)
			assert.Equal(t, pb.TNService, cb.Commands[0].ServiceType)
			assert.True(t, cb.Commands[0].ConfigChange.Replica.ReplicaID > 0)

			cb, err = store.getCommandBatch(ctx, store.id())
			require.NoError(t, err)
			require.Equal(t, 1, len(cb.Commands))
			assert.True(t, cb.Commands[0].Bootstrapping)
			service := &Service{store: store}
			service.handleStartReplica(cb.Commands[0])

			for i := 0; i < 100; i++ {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				m := store.getHeartbeatMessage()
				_, err = store.addLogStoreHeartbeat(ctx, m)
				assert.NoError(t, err)

				state, err = store.getCheckerState()
				require.NoError(t, err)
				store.checkBootstrap(state)

				state, err = store.getCheckerState()
				require.NoError(t, err)
				if state.State != pb.HAKeeperRunning {
					time.Sleep(50 * time.Millisecond)
				} else {
					return
				}
				if i == 99 {
					t.Fatalf("failed to complete bootstrap")
				}
			}
		}
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestTaskSchedulerCanScheduleTasksToCNs(t *testing.T) {
	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		cb, err := store.getCommandBatch(ctx, store.id())
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		assert.True(t, cb.Commands[0].Bootstrapping)
		service := &Service{store: store}
		service.handleStartReplica(cb.Commands[0])

		for i := 0; i < 100; i++ {
			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
			defer cancel()
			m := store.getHeartbeatMessage()
			_, err = store.addLogStoreHeartbeat(ctx, m)
			assert.NoError(t, err)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			store.checkBootstrap(state)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		cnUUID1 := uuid.New().String()
		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
		assert.NoError(t, err)
		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "a"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err := taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		// update state
		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.taskSchedule(state)
		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))

		cnUUID2 := uuid.New().String()
		cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
		assert.NoError(t, err)
		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "b"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))
	}
	runHakeeperTaskServiceTest(t, fn)
}

func TestTaskSchedulerCanReScheduleExpiredTasks(t *testing.T) {
	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		cb, err := store.getCommandBatch(ctx, store.id())
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		assert.True(t, cb.Commands[0].Bootstrapping)
		service := &Service{store: store}
		service.handleStartReplica(cb.Commands[0])

		for i := 0; i < 100; i++ {
			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
			defer cancel()
			m := store.getHeartbeatMessage()
			_, err = store.addLogStoreHeartbeat(ctx, m)
			assert.NoError(t, err)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			store.checkBootstrap(state)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		cnUUID1 := uuid.New().String()
		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
		assert.NoError(t, err)
		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "a"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err := taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		// update state
		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.taskSchedule(state)
		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))

		cnUUID2 := uuid.New().String()
		for i := 0; i < 1000; i++ {
			testLogger.Debug(fmt.Sprintf("iteration %d", i))
			tn := func() bool {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
				_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
				assert.NoError(t, err)
				state, err = store.getCheckerState()
				require.NoError(t, err)
				store.taskSchedule(state)
				tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
				assert.NoError(t, err)
				if len(tasks) == 0 {
					testLogger.Info("no task found")
					time.Sleep(50 * time.Millisecond)
				} else {
					tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
					assert.Equal(t, 0, len(tasks))
					return true
				}
				return false
			}
			completed := tn()
			if completed {
				store.taskScheduler.StopScheduleCronTask()
				return
			}
			time.Sleep(100 * time.Millisecond)
		}
		t.Fatalf("failed to reschedule expired tasks")
	}
	runHakeeperTaskServiceTest(t, fn)
}

func TestGetTaskTableUserFromEnv(t *testing.T) {
	t.Setenv(moAdminUser, "root")
	user, ok := getTaskTableUserFromEnv()
	require.False(t, ok)
	require.Equal(t, pb.TaskTableUser{}, user)

	t.Setenv(moAdminPassword, "")
	user, ok = getTaskTableUserFromEnv()
	require.False(t, ok)
	require.Equal(t, pb.TaskTableUser{}, user)

	t.Setenv(moAdminPassword, "root")
	user, ok = getTaskTableUserFromEnv()
	require.True(t, ok)
	require.Equal(t, pb.TaskTableUser{Username: "root", Password: "root"}, user)
}