github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/service_test.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logservice

import (
	"context"
	"fmt"
	"runtime/debug"
	"sync"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/lni/dragonboat/v4"
	"github.com/lni/goutils/leaktest"
	"github.com/lni/vfs"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	hapkg "github.com/matrixorigin/matrixone/pkg/hakeeper"
	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

const (
	testServiceAddress    = "127.0.0.1:9000"
	testGossipAddress     = "127.0.0.1:9010"
	dummyGossipSeedAddress = "127.0.0.1:9100"
	testServerMaxMsgSize  = 1000
)

func getServiceTestConfig() Config {
	c := Config{
		UUID:                 uuid.New().String(),
		RTTMillisecond:       10,
		GossipAddress:        testGossipAddress,
		GossipListenAddress:  testGossipAddress,
		GossipSeedAddresses:  []string{testGossipAddress, dummyGossipSeedAddress},
		DeploymentID:         1,
		FS:                   vfs.NewStrictMem(),
		ServiceListenAddress: testServiceAddress,
		ServiceAddress:       testServiceAddress,
		DisableWorkers:       true,
		UseTeeLogDB:          true,
	}
	c.RPC.MaxMessageSize = testServerMaxMsgSize
	c.Fill()
	return c
}

func runServiceTest(t *testing.T,
	hakeeper bool, startReplica bool, fn func(*testing.T, *Service)) {
	defer leaktest.AfterTest(t)()
	cfg := getServiceTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	service, err := NewService(cfg,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service.Close())
	}()

	if startReplica {
		shardID := hapkg.DefaultHAKeeperShardID
		peers := make(map[uint64]dragonboat.Target)
		peers[1] = service.ID()
		if hakeeper {
			require.NoError(t, service.store.startHAKeeperReplica(1, peers, false))
		} else {
			shardID = 1
			require.NoError(t, service.store.startReplica(1, 1, peers, false))
		}

		// wait for leader to be elected
		done := false
		for i := 0; i < 1000; i++ {
			_, _, ok, err := service.store.nh.GetLeaderID(shardID)
			require.NoError(t, err)
			if ok {
				done = true
				break
			}
			time.Sleep(10 * time.Millisecond)
		}
		require.True(t, done)
	}

	fn(t, service)
}

func TestNewService(t *testing.T) {
	defer leaktest.AfterTest(t)()
	cfg := getServiceTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	service, err := NewService(cfg,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	assert.NoError(t, service.Close())
}

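// The handler tests below drive the Service's request handlers (handleConnect,
// handleAppend, handleRead, ...) directly with pb.Request values instead of
// going through the RPC layer; runServiceTest starts a single replica log
// shard, or the HAKeeper shard when its hakeeper argument is set, waits for a
// leader to be elected, and then invokes the supplied test body.
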
func TestServiceConnect(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

func TestServiceConnectTimeout(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.ErrDragonboatTimeout), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

func TestServiceConnectRO(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

// getTestAppendCmd builds a user entry update payload: a headerSize byte
// header carrying pb.UserEntryUpdate, followed by the 8 byte DN/replica ID
// and then the user data.
func getTestAppendCmd(id uint64, data []byte) []byte {
	cmd := make([]byte, len(data)+headerSize+8)
	binaryEnc.PutUint32(cmd, uint32(pb.UserEntryUpdate))
	binaryEnc.PutUint64(cmd[headerSize:], id)
	copy(cmd[headerSize+8:], data)
	return cmd
}

func TestServiceHandleLogHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.LOG_HEARTBEAT,
			LogHeartbeat: &pb.LogStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		sc1 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 1,
				},
			},
		}
		sc2 := pb.ScheduleCommand{
			UUID: "uuid2",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 2,
				},
			},
		}
		sc3 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 3,
				},
			},
		}
		require.NoError(t,
			s.store.addScheduleCommands(ctx, 1, []pb.ScheduleCommand{sc1, sc2, sc3}))
		resp := s.handleLogHeartbeat(ctx, req)
		require.Equal(t, []pb.ScheduleCommand{sc1, sc3}, resp.CommandBatch.Commands)
	}
	runServiceTest(t, true, true, fn)
}

func TestServiceHandleCNHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CN_HEARTBEAT,
			CNHeartbeat: &pb.CNStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		resp := s.handleCNHeartbeat(ctx, req)
		assert.Equal(t, &pb.CommandBatch{}, resp.CommandBatch)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, true, true, fn)
}

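// Mirrors TestServiceHandleLogHeartbeat above: schedule commands are queued
// per store UUID, so the DN heartbeat sent by "uuid1" only receives sc1 and
// sc3, not the command added for "uuid2".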
func TestServiceHandleDNHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.DN_HEARTBEAT,
			DNHeartbeat: &pb.DNStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		sc1 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 1,
				},
			},
		}
		sc2 := pb.ScheduleCommand{
			UUID: "uuid2",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 2,
				},
			},
		}
		sc3 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 3,
				},
			},
		}
		require.NoError(t,
			s.store.addScheduleCommands(ctx, 1, []pb.ScheduleCommand{sc1, sc2, sc3}))
		resp := s.handleDNHeartbeat(ctx, req)
		require.Equal(t, []pb.ScheduleCommand{sc1, sc3}, resp.CommandBatch.Commands)
	}
	runServiceTest(t, true, true, fn)
}

func TestServiceHandleAppend(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)
	}
	runServiceTest(t, false, true, fn)
}

func TestServiceHandleAppendWhenNotBeingTheLeaseHolder(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID+1, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.ErrNotLeaseHolder), resp.ErrorCode)
		assert.Equal(t, uint64(0), resp.LogResponse.Lsn)
	}
	runServiceTest(t, false, true, fn)
}

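// Reading shard 1 from Lsn 1 shows why the append tests above see their first
// user entry at Lsn 4: the first three Lsns hold two internal records plus the
// lease update left by the preceding connect, and only then comes the user
// record carrying cmd.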
func TestServiceHandleRead(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.READ,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     1,
				MaxSize: 1024 * 32,
			},
		}
		resp, records := s.handleRead(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1), resp.LogResponse.LastLsn)
		require.Equal(t, 4, len(records.Records))
		assert.Equal(t, pb.Internal, records.Records[0].Type)
		assert.Equal(t, pb.Internal, records.Records[1].Type)
		assert.Equal(t, pb.LeaseUpdate, records.Records[2].Type)
		assert.Equal(t, pb.UserRecord, records.Records[3].Type)
		assert.Equal(t, cmd, records.Records[3].Data)
	}
	runServiceTest(t, false, true, fn)
}

func TestServiceTruncate(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     4,
			},
		}
		resp = s.handleTruncate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(0), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.GET_TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleGetTruncatedIndex(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     3,
			},
		}
		resp = s.handleTruncate(ctx, req)
		assert.Equal(t, uint32(moerr.ErrInvalidTruncateLsn), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

func TestServiceTsoUpdate(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.TSO_UPDATE,
			TsoRequest: &pb.TsoRequest{
				Count: 100,
			},
		}
		resp := s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1), resp.TsoResponse.Value)

		req.TsoRequest.Count = 1000
		resp = s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(101), resp.TsoResponse.Value)

		resp = s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1101), resp.TsoResponse.Value)
	}
	runServiceTest(t, false, true, fn)
}

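// handleCheckHAKeeper reports IsHAKeeper only after an HAKeeper replica has
// been started on the store, which the two runs below exercise in turn.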
func TestServiceCheckHAKeeper(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CHECK_HAKEEPER,
		}
		resp := s.handleCheckHAKeeper(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.False(t, resp.IsHAKeeper)
	}
	runServiceTest(t, false, false, fn)

	fn = func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		init := make(map[uint64]dragonboat.Target)
		init[1] = s.ID()
		require.NoError(t, s.store.startHAKeeperReplica(1, init, false))
		req := pb.Request{
			Method: pb.CHECK_HAKEEPER,
		}
		resp := s.handleCheckHAKeeper(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.True(t, resp.IsHAKeeper)
	}
	runServiceTest(t, false, false, fn)
}

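// TestShardInfoCanBeQueried runs two services hosting one single replica
// shard each (shard 1 on service1, shard 2 on service2) and polls
// getShardInfo until gossip has made both shards, including their leader and
// replica addresses, visible from both services.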
func TestShardInfoCanBeQueried(t *testing.T) {
	defer leaktest.AfterTest(t)()
	cfg1 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-1",
		ServiceAddress:      "127.0.0.1:9002",
		RaftAddress:         "127.0.0.1:9000",
		GossipAddress:       "127.0.0.1:9001",
		GossipSeedAddresses: []string{"127.0.0.1:9011"},
		DisableWorkers:      true,
	}
	cfg2 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-2",
		ServiceAddress:      "127.0.0.1:9012",
		RaftAddress:         "127.0.0.1:9010",
		GossipAddress:       "127.0.0.1:9011",
		GossipSeedAddresses: []string{"127.0.0.1:9001"},
		DisableWorkers:      true,
	}
	cfg1.Fill()
	service1, err := NewService(cfg1,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service1.Close())
	}()
	peers1 := make(map[uint64]dragonboat.Target)
	peers1[1] = service1.ID()
	assert.NoError(t, service1.store.startReplica(1, 1, peers1, false))
	cfg2.Fill()
	service2, err := NewService(cfg2,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service2.Close())
	}()
	peers2 := make(map[uint64]dragonboat.Target)
	peers2[1] = service2.ID()
	assert.NoError(t, service2.store.startReplica(2, 1, peers2, false))

	nhID1 := service1.ID()
	nhID2 := service2.ID()

	done := false

	// FIXME:
	// as per #3478, this test is flaky, increased loop count to 6000 to
	// see whether gossip can finish syncing in 6 seconds time. also added
	// some logging to collect more details
	for i := 0; i < 6000; i++ {
		si1, ok := service1.getShardInfo(1)
		if !ok || si1.LeaderID != 1 {
			testLogger.Error("shard 1 info missing on service 1")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si1.Replicas))
		require.Equal(t, uint64(1), si1.ShardID)
		ri, ok := si1.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID1, ri.UUID)
		assert.Equal(t, cfg1.ServiceAddress, ri.ServiceAddress)

		si2, ok := service1.getShardInfo(2)
		if !ok || si2.LeaderID != 1 {
			testLogger.Error("shard 2 info missing on service 1")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si2.Replicas))
		require.Equal(t, uint64(2), si2.ShardID)
		ri, ok = si2.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID2, ri.UUID)
		assert.Equal(t, cfg2.ServiceAddress, ri.ServiceAddress)

		si1, ok = service2.getShardInfo(1)
		if !ok || si1.LeaderID != 1 {
			testLogger.Error("shard 1 info missing on service 2")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si1.Replicas))
		require.Equal(t, uint64(1), si1.ShardID)
		ri, ok = si1.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID1, ri.UUID)
		assert.Equal(t, cfg1.ServiceAddress, ri.ServiceAddress)

		si2, ok = service2.getShardInfo(2)
		if !ok || si2.LeaderID != 1 {
			testLogger.Error("shard 2 info missing on service 2")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si2.Replicas))
		require.Equal(t, uint64(2), si2.ShardID)
		ri, ok = si2.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID2, ri.UUID)
		assert.Equal(t, cfg2.ServiceAddress, ri.ServiceAddress)

		done = true
		break
	}
	assert.True(t, done)
}

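// TestGossipInSimulatedCluster starts a 24 node cluster hosting 8 three
// replica shards, waits until every service has learned every shard's leader
// through gossip, adds a fourth replica to shard 1 and checks that the change
// propagates, then restarts one service to confirm it can re-learn all of the
// shard info.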
func TestGossipInSimulatedCluster(t *testing.T) {
	defer leaktest.AfterTest(t)()
	debug.SetMemoryLimit(1 << 30)
	// start all services
	nodeCount := 24
	shardCount := nodeCount / 3
	configs := make([]Config, 0)
	services := make([]*Service, 0)
	for i := 0; i < nodeCount; i++ {
		cfg := Config{
			FS:             vfs.NewStrictMem(),
			UUID:           uuid.New().String(),
			DeploymentID:   1,
			RTTMillisecond: 200,
			DataDir:        fmt.Sprintf("data-%d", i),
			ServiceAddress: fmt.Sprintf("127.0.0.1:%d", 26000+10*i),
			RaftAddress:    fmt.Sprintf("127.0.0.1:%d", 26000+10*i+1),
			GossipAddress:  fmt.Sprintf("127.0.0.1:%d", 26000+10*i+2),
			GossipSeedAddresses: []string{
				"127.0.0.1:26002",
				"127.0.0.1:26012",
				"127.0.0.1:26022",
				"127.0.0.1:26032",
				"127.0.0.1:26042",
				"127.0.0.1:26052",
				"127.0.0.1:26062",
				"127.0.0.1:26072",
				"127.0.0.1:26082",
				"127.0.0.1:26092",
			},
			DisableWorkers:  true,
			LogDBBufferSize: 1024 * 16,
		}
		cfg.GossipProbeInterval.Duration = 350 * time.Millisecond
		configs = append(configs, cfg)
		service, err := NewService(cfg,
			testutil.NewFS(),
			WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
				return true
			}),
		)
		require.NoError(t, err)
		services = append(services, service)
	}
	defer func() {
		testLogger.Info("going to close all services")
		var wg sync.WaitGroup
		for _, s := range services {
			if s != nil {
				selected := s
				wg.Add(1)
				go func() {
					require.NoError(t, selected.Close())
					wg.Done()
					testLogger.Info("closed a service")
				}()
			}
		}
		wg.Wait()
		time.Sleep(time.Second * 2)
	}()
	// start all replicas
	// shardID: [1, shardCount]
	id := uint64(100)
	for i := uint64(0); i < uint64(shardCount); i++ {
		shardID := i + 1
		r1 := id
		r2 := id + 1
		r3 := id + 2
		id += 3
		replicas := make(map[uint64]dragonboat.Target)
		replicas[r1] = services[i*3].ID()
		replicas[r2] = services[i*3+1].ID()
		replicas[r3] = services[i*3+2].ID()
		require.NoError(t, services[i*3+0].store.startReplica(shardID, r1, replicas, false))
		require.NoError(t, services[i*3+1].store.startReplica(shardID, r2, replicas, false))
		require.NoError(t, services[i*3+2].store.startReplica(shardID, r3, replicas, false))
	}
	wait := func() {
		time.Sleep(50 * time.Millisecond)
	}
	// check & wait until all leaders have been elected and are known to all services
	cci := uint64(0)
	iterations := 1000
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := 0; i < nodeCount; i++ {
			shardID := uint64(i/3 + 1)
			service := services[i]
			info, ok := service.getShardInfo(shardID)
			if !ok || info.LeaderID == 0 {
				notReady++
				wait()
				continue
			}
			if shardID == 1 && info.Epoch != 0 {
				cci = info.Epoch
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
	require.True(t, cci != 0)
	// all good now, add a replica to shard 1
	id += 1

	for i := 0; i < iterations; i++ {
		err := services[0].store.addReplica(1, id, services[3].ID(), cci)
		if err == nil {
			break
		} else if err == dragonboat.ErrTimeout || err == dragonboat.ErrSystemBusy ||
			err == dragonboat.ErrInvalidDeadline || err == dragonboat.ErrTimeoutTooSmall {
			info, ok := services[0].getShardInfo(1)
			if ok && info.LeaderID != 0 && len(info.Replicas) == 4 {
				break
			}
			wait()
			continue
		} else if err == dragonboat.ErrRejected {
			break
		}
		t.Fatalf("failed to add replica, %v", err)
	}

	// check that the above change can be observed by all services
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := 0; i < nodeCount; i++ {
			service := services[i]
			info, ok := service.getShardInfo(1)
			if !ok || info.LeaderID == 0 || len(info.Replicas) != 4 {
				notReady++
				wait()
				continue
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
	// restart a service and watch how long it takes to get all of the
	// required shard info again
	require.NoError(t, services[12].Close())
	services[12] = nil
	time.Sleep(2 * time.Second)
	service, err := NewService(configs[12],
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, service.Close())
	}()
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := uint64(0); i < uint64(shardCount); i++ {
			shardID := i + 1
			info, ok := service.getShardInfo(shardID)
			if !ok || info.LeaderID == 0 {
				notReady++
				wait()
				continue
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
}