github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/store_hakeeper_check_test.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logservice

import (
	"context"
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/lni/dragonboat/v4"
	"github.com/lni/goutils/leaktest"
	"github.com/lni/vfs"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/hakeeper"
	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/pb/task"
	"github.com/matrixorigin/matrixone/pkg/taskservice"
	"github.com/matrixorigin/matrixone/pkg/testutil"
)

func TestIDAllocatorDefaultState(t *testing.T) {
	alloc := newIDAllocator()
	assert.Equal(t, uint64(0), alloc.Capacity())
	v, ok := alloc.Next()
	assert.False(t, ok)
	assert.Equal(t, uint64(0), v)
}

func TestIDAllocatorCapacity(t *testing.T) {
	tests := []struct {
		next     uint64
		last     uint64
		capacity uint64
	}{
		{1, 1, 1},
		{2, 1, 0},
		{1, 2, 2},
		{100, 200, 101},
	}

	for _, tt := range tests {
		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
		assert.Equal(t, tt.capacity, alloc.Capacity())
	}
}

func TestIDAllocatorSet(t *testing.T) {
	alloc := idAllocator{nextID: 100, lastID: 200}
	alloc.Set(hakeeper.K8SIDRangeEnd, hakeeper.K8SIDRangeEnd+100)
	expected := idAllocator{
		nextID: hakeeper.K8SIDRangeEnd,
		lastID: hakeeper.K8SIDRangeEnd + 100,
	}
	assert.Equal(t, expected, alloc)
}

func TestIDAllocatorRejectInvalidSetInput(t *testing.T) {
	alloc := idAllocator{nextID: 100, lastID: 200}
	defer func() {
		if r := recover(); r == nil {
			t.Fatalf("failed to trigger panic")
		}
	}()
	alloc.Set(300, 400)
}

func TestIDAllocatorNext(t *testing.T) {
	tests := []struct {
		next     uint64
		last     uint64
		capacity uint64
	}{
		{1, 1, 1},
		{2, 1, 0},
		{1, 2, 2},
		{100, 200, 101},
	}

	for _, tt := range tests {
		expected := tt.next
		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
		for {
			hasID := alloc.Capacity() != 0
			v, ok := alloc.Next()
			assert.Equal(t, hasID, ok)
			if hasID {
				assert.Equal(t, expected, v)
				expected++
			} else {
				assert.Equal(t, uint64(0), v)
				break
			}
		}
	}
}

func TestHandleBootstrapFailure(t *testing.T) {
	defer func() {
		if r := recover(); r == nil {
			t.Fatalf("failed to trigger panic")
		}
	}()
	s := store{}
	s.handleBootstrapFailure()
}

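// runHAKeeperStoreTest starts a single log service store (optionally with a
// log replica), launches HAKeeper replica 1 on it and invokes fn against the
// store. The store is closed once fn returns.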
func runHAKeeperStoreTest(t *testing.T, startLogReplica bool, fn func(*testing.T, *store)) {
	defer leaktest.AfterTest(t)()
	cfg := getStoreTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	store, err := getTestStore(cfg, startLogReplica, nil)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, store.close())
	}()
	peers := make(map[uint64]dragonboat.Target)
	peers[1] = store.id()
	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
	fn(t, store)
}

func runHakeeperTaskServiceTest(t *testing.T, fn func(*testing.T, *store, taskservice.TaskService)) {
	defer leaktest.AfterTest(t)()
	cfg := getStoreTestConfig()
	cfg.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	defer vfs.ReportLeakedFD(cfg.FS, t)

	taskService := taskservice.NewTaskService(runtime.DefaultRuntime(), taskservice.NewMemTaskStorage())
	defer taskService.StopScheduleCronTask()

	store, err := getTestStore(cfg, false, taskService)
	assert.NoError(t, err)
	defer func() {
		assert.NoError(t, store.close())
	}()
	peers := make(map[uint64]dragonboat.Target)
	peers[1] = store.id()
	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
	fn(t, store, taskService)
}

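// runHAKeeperClusterTest starts four log services on localhost, launches
// HAKeeper replicas 1-3 on the first three services and invokes fn with all
// four services. All services are closed once fn returns.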
func runHAKeeperClusterTest(t *testing.T, fn func(*testing.T, []*Service)) {
	defer leaktest.AfterTest(t)()
	cfg1 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-1",
		ServiceAddress:      "127.0.0.1:9002",
		RaftAddress:         "127.0.0.1:9000",
		GossipAddress:       "127.0.0.1:9001",
		GossipSeedAddresses: []string{"127.0.0.1:9011", "127.0.0.1:9021", "127.0.0.1:9031"},
		DisableWorkers:      true,
	}
	cfg1.HAKeeperConfig.TickPerSecond = 10
	cfg1.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg1.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
	cfg1.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg2 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-2",
		ServiceAddress:      "127.0.0.1:9012",
		RaftAddress:         "127.0.0.1:9010",
		GossipAddress:       "127.0.0.1:9011",
		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9021", "127.0.0.1:9031"},
		DisableWorkers:      true,
	}
	cfg2.HAKeeperConfig.TickPerSecond = 10
	cfg2.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg2.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
	cfg2.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg3 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-3",
		ServiceAddress:      "127.0.0.1:9022",
		RaftAddress:         "127.0.0.1:9020",
		GossipAddress:       "127.0.0.1:9021",
		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9031"},
		DisableWorkers:      true,
	}
	cfg3.HAKeeperConfig.TickPerSecond = 10
	cfg3.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg3.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
	cfg3.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg4 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-4",
		ServiceAddress:      "127.0.0.1:9032",
		RaftAddress:         "127.0.0.1:9030",
		GossipAddress:       "127.0.0.1:9031",
		GossipSeedAddresses: []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9021"},
		DisableWorkers:      true,
	}
	cfg4.HAKeeperConfig.TickPerSecond = 10
	cfg4.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
	cfg4.HAKeeperConfig.DNStoreTimeout.Duration = 10 * time.Second
	cfg4.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
	cfg1.Fill()
	service1, err := NewService(cfg1,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service1.Close())
	}()
	cfg2.Fill()
	service2, err := NewService(cfg2,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service2.Close())
	}()
	cfg3.Fill()
	service3, err := NewService(cfg3,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service3.Close())
	}()
	cfg4.Fill()
	service4, err := NewService(cfg4,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service4.Close())
	}()

	peers := make(map[uint64]dragonboat.Target)
	peers[1] = service1.ID()
	peers[2] = service2.ID()
	peers[3] = service3.ID()
	assert.NoError(t, service1.store.startHAKeeperReplica(1, peers, false))
	assert.NoError(t, service2.store.startHAKeeperReplica(2, peers, false))
	assert.NoError(t, service3.store.startHAKeeperReplica(3, peers, false))
	fn(t, []*Service{service1, service2, service3, service4})
}

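// TestHAKeeperCanBootstrapAndRepairShards bootstraps a cluster with 1 DN shard
// and 1 Log shard (Log and HAKeeper with 3 replicas each), stops one of the
// log services and then waits for HAKeeper to repair both the Log/HAKeeper
// shards and the DN shard on the remaining stores.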
func TestHAKeeperCanBootstrapAndRepairShards(t *testing.T) {
	fn := func(t *testing.T, services []*Service) {
		// bootstrap the cluster, 1 DN 1 Log shard, Log and HAKeeper have
		// 3 replicas
		store1 := services[0].store
		state, err := store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		require.NoError(t, store1.setInitialClusterInfo(1, 1, 3))
		state, err = store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)

		sendHeartbeat := func(ss []*Service) {
			for _, s := range ss {
				done := false
				for i := 0; i < 10; i++ {
					m := s.store.getHeartbeatMessage()
					ctx, cancel := context.WithTimeout(context.Background(), time.Second)
					defer cancel()
					_, err := s.store.addLogStoreHeartbeat(ctx, m)
					if err == dragonboat.ErrTimeout {
						time.Sleep(100 * time.Millisecond)
					} else {
						if err == nil {
							done = true
							break
						} else {
							t.Fatalf("failed to add heartbeat %v", err)
						}
					}
				}
				if !done {
					t.Fatalf("failed to add heartbeat after 10 retries")
				}
			}
		}
		sendHeartbeat(services[:3])

		// fake a DN store
		dnMsg := pb.DNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.DNShardInfo, 0),
		}
		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
		defer cancel()
		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg)
		require.NoError(t, err)

		// find out the leader HAKeeper store as we need the term value
		var term uint64
		var leaderStore *store
		for _, s := range services[:3] {
			isLeader, curTerm, err := s.store.isLeaderHAKeeper()
			require.NoError(t, err)
			if isLeader {
				term = curTerm
				leaderStore = s.store
				break
			}
		}
		require.NotNil(t, leaderStore)
		require.True(t, term > 0)

		// bootstrap the cluster
		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		leaderStore.bootstrap(term, state)

		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), leaderStore.bootstrapCheckCycles)
		require.NotNil(t, leaderStore.bootstrapMgr)
		assert.False(t, leaderStore.bootstrapMgr.CheckBootstrap(state.LogState))

		// get and apply all bootstrap schedule commands
		for _, s := range services[:3] {
			cb, err := s.store.getCommandBatch(ctx, s.store.id())
			require.NoError(t, err)
			if len(cb.Commands) > 0 {
				s.handleStartReplica(cb.Commands[0])
			}
		}

		// check bootstrap can be completed
		for i := 0; i < 100; i++ {
			sendHeartbeat(services[:3])
			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			leaderStore.checkBootstrap(state)

			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				// FIXME: why wait here?
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		// get the DN bootstrap command, it contains DN shard and replica ID
		cb, err := leaderStore.getCommandBatch(ctx, dnMsg.UUID)
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		cmd := cb.Commands[0]
		assert.True(t, cmd.Bootstrapping)
		assert.Equal(t, pb.DNService, cmd.ServiceType)
		dnShardInfo := pb.DNShardInfo{
			ShardID:   cmd.ConfigChange.Replica.ShardID,
			ReplicaID: cmd.ConfigChange.Replica.ReplicaID,
		}
		dnMsg.Shards = append(dnMsg.Shards, dnShardInfo)
		// as if DN is running
		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg)
		require.NoError(t, err)
		// fake a free DN store
		dnMsg2 := pb.DNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.DNShardInfo, 0),
		}
		_, err = services[0].store.addDNStoreHeartbeat(ctx, dnMsg2)
		require.NoError(t, err)

		// stop store 1
		require.NoError(t, services[0].Close())
		// note that service.Close cannot be called repeatedly
		services[0].store = nil
		services = services[1:]

		// wait for HAKeeper to repair the Log & HAKeeper shards
		dnRepaired := false
		for i := 0; i < 5000; i++ {
			testLogger.Info(fmt.Sprintf("iteration %d", i))
			tn := func() (bool, error) {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				m := services[0].store.getHeartbeatMessage()
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[0].handleCommands(cb.Commands)
				}
				m = services[1].store.getHeartbeatMessage()
				if cb, err := services[1].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[1].handleCommands(cb.Commands)
				}
				m = services[2].store.getHeartbeatMessage()
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[2].handleCommands(cb.Commands)
				}
				if _, err := services[0].store.addDNStoreHeartbeat(ctx, dnMsg2); err != nil {
					return false, err
				}

				for _, s := range services {
					if hasShard(s.store, 0) {
						s.store.hakeeperTick()
						s.store.hakeeperCheck()
					}

					cb, err = services[0].store.getCommandBatch(ctx, dnMsg2.UUID)
					if err != nil {
						return false, err
					}
					if len(cb.Commands) > 0 {
						cmd := cb.Commands[0]
						if cmd.ServiceType == pb.DNService {
							if cmd.ConfigChange != nil && cmd.ConfigChange.Replica.ShardID == dnShardInfo.ShardID &&
								cmd.ConfigChange.Replica.ReplicaID > dnShardInfo.ReplicaID {
								dnRepaired = true
							}
						}
					}
				}

				logRepaired := true
				for _, s := range services {
					if !hasShard(s.store, 0) || !hasShard(s.store, 1) {
						logRepaired = false
						break
					}
				}
				testLogger.Info(fmt.Sprintf("dnRepaired %t, logRepaired %t", dnRepaired, logRepaired))
				if !logRepaired || !dnRepaired {
					return false, nil
				} else {
					testLogger.Info(fmt.Sprintf("repair completed, i: %d", i))
					return true, nil
				}
			}
			completed, err := tn()
			if err != nil && err != dragonboat.ErrTimeout &&
				err != dragonboat.ErrInvalidDeadline && err != dragonboat.ErrTimeoutTooSmall {
				t.Fatalf("unexpected error %v", err)
			}
			if completed {
				for _, s := range services[:3] {
					_ = s.task.holder.Close()
				}
				return
			}
			time.Sleep(5 * time.Millisecond)
		}
		t.Fatalf("failed to repair shards")
	}
	runHAKeeperClusterTest(t, fn)
}

func TestGetCheckerState(t *testing.T) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestSetInitialClusterInfo(t *testing.T) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestFailedBootstrap(t *testing.T) {
	testBootstrap(t, true)
}

func TestBootstrap(t *testing.T) {
	testBootstrap(t, false)
}

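// testBootstrap drives a single-replica HAKeeper through the bootstrap
// procedure. When fail is true, the bootstrap schedule commands are never
// applied, so repeated checking is expected to end in the
// HAKeeperBootstrapFailed state; otherwise the commands are applied and the
// test waits for the HAKeeperRunning state.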
func testBootstrap(t *testing.T, fail bool) {
	fn := func(t *testing.T, store *store) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		dnMsg := pb.DNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.DNShardInfo, 0),
		}
		_, err = store.addDNStoreHeartbeat(ctx, dnMsg)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		if fail {
			// keep checking, bootstrap will eventually be set as failed
			for i := 0; i <= checkBootstrapCycles; i++ {
				store.checkBootstrap(state)
			}

			state, err = store.getCheckerState()
			require.NoError(t, err)
			assert.Equal(t, pb.HAKeeperBootstrapFailed, state.State)
		} else {
			cb, err := store.getCommandBatch(ctx, dnMsg.UUID)
			require.NoError(t, err)
			require.Equal(t, 1, len(cb.Commands))
			assert.True(t, cb.Commands[0].Bootstrapping)
			assert.Equal(t, pb.DNService, cb.Commands[0].ServiceType)
			assert.True(t, cb.Commands[0].ConfigChange.Replica.ReplicaID > 0)

			cb, err = store.getCommandBatch(ctx, store.id())
			require.NoError(t, err)
			require.Equal(t, 1, len(cb.Commands))
			assert.True(t, cb.Commands[0].Bootstrapping)
			service := &Service{store: store}
			service.handleStartReplica(cb.Commands[0])

			for i := 0; i < 100; i++ {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				m := store.getHeartbeatMessage()
				_, err = store.addLogStoreHeartbeat(ctx, m)
				assert.NoError(t, err)

				state, err = store.getCheckerState()
				require.NoError(t, err)
				store.checkBootstrap(state)

				state, err = store.getCheckerState()
				require.NoError(t, err)
				if state.State != pb.HAKeeperRunning {
					time.Sleep(50 * time.Millisecond)
				} else {
					return
				}
				if i == 99 {
					t.Fatalf("failed to complete bootstrap")
				}
			}
		}
	}
	runHAKeeperStoreTest(t, false, fn)
}

func TestTaskSchedulerCanScheduleTasksToCNs(t *testing.T) {
	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		cb, err := store.getCommandBatch(ctx, store.id())
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		assert.True(t, cb.Commands[0].Bootstrapping)
		service := &Service{store: store}
		service.handleStartReplica(cb.Commands[0])

		for i := 0; i < 100; i++ {
			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
			defer cancel()
			m := store.getHeartbeatMessage()
			_, err = store.addLogStoreHeartbeat(ctx, m)
			assert.NoError(t, err)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			store.checkBootstrap(state)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		cnUUID1 := uuid.New().String()
		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
		assert.NoError(t, err)
		err = taskService.Create(ctx, task.TaskMetadata{ID: "a"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err := taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		// update state
		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.taskSchedule(state)
		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))

		cnUUID2 := uuid.New().String()
		cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
		assert.NoError(t, err)
		err = taskService.Create(ctx, task.TaskMetadata{ID: "b"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))
	}
	runHakeeperTaskServiceTest(t, fn)
}

func TestTaskSchedulerCanReScheduleExpiredTasks(t *testing.T) {
	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
		state, err := store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		require.NoError(t, store.setInitialClusterInfo(1, 1, 1))
		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		m := store.getHeartbeatMessage()
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		_, err = store.addLogStoreHeartbeat(ctx, m)
		assert.NoError(t, err)

		_, term, err := store.isLeaderHAKeeper()
		require.NoError(t, err)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.bootstrap(term, state)

		state, err = store.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
		require.NotNil(t, store.bootstrapMgr)
		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))

		cb, err := store.getCommandBatch(ctx, store.id())
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		assert.True(t, cb.Commands[0].Bootstrapping)
		service := &Service{store: store}
		service.handleStartReplica(cb.Commands[0])

		for i := 0; i < 100; i++ {
			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
			defer cancel()
			m := store.getHeartbeatMessage()
			_, err = store.addLogStoreHeartbeat(ctx, m)
			assert.NoError(t, err)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			store.checkBootstrap(state)

			state, err = store.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		cnUUID1 := uuid.New().String()
		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
		assert.NoError(t, err)
		err = taskService.Create(ctx, task.TaskMetadata{ID: "a"})
		assert.NoError(t, err)
		state, err = store.getCheckerState()
		require.NoError(t, err)
		tasks, err := taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 0, len(tasks))
		store.taskSchedule(state)
		// update state
		state, err = store.getCheckerState()
		require.NoError(t, err)
		store.taskSchedule(state)
		tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
		assert.NoError(t, err)
		assert.Equal(t, 1, len(tasks))

		cnUUID2 := uuid.New().String()
		for i := 0; i < 1000; i++ {
			testLogger.Info(fmt.Sprintf("iteration %d", i))
			tn := func() bool {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
				_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
				assert.NoError(t, err)
				state, err = store.getCheckerState()
				require.NoError(t, err)
				store.taskSchedule(state)
				tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
				assert.NoError(t, err)
				if len(tasks) == 0 {
					testLogger.Info("no task found")
					time.Sleep(50 * time.Millisecond)
				} else {
					tasks, err = taskService.QueryTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
					assert.Equal(t, 0, len(tasks))
					return true
				}
				return false
			}
			completed := tn()
			if completed {
				store.taskScheduler.StopScheduleCronTask()
				return
			}
			time.Sleep(100 * time.Millisecond)
		}
		t.Fatalf("failed to reschedule expired tasks")
	}
	runHakeeperTaskServiceTest(t, fn)
}

func TestGetTaskTableUserFromEnv(t *testing.T) {
	os.Setenv(moAdminUser, "root")
	user, ok := getTaskTableUserFromEnv()
	require.False(t, ok)
	require.Equal(t, pb.TaskTableUser{}, user)

	os.Setenv(moAdminPassword, "")
	user, ok = getTaskTableUserFromEnv()
	require.False(t, ok)
	require.Equal(t, pb.TaskTableUser{}, user)

	os.Setenv(moAdminPassword, "root")
	user, ok = getTaskTableUserFromEnv()
	require.True(t, ok)
	require.Equal(t, pb.TaskTableUser{Username: "root", Password: "root"}, user)
}