// github.com/matrixorigin/matrixone@v1.2.0/pkg/tests/service/service.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package service

import (
	"context"
	"os"
	"path/filepath"
	"sync"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/matrixorigin/matrixone/pkg/cnservice"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/common/stopper"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/hakeeper"
	"github.com/matrixorigin/matrixone/pkg/hakeeper/checkers/syshealth"
	"github.com/matrixorigin/matrixone/pkg/logservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	logpb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
	"github.com/matrixorigin/matrixone/pkg/testutil"
	"github.com/matrixorigin/matrixone/pkg/tnservice"
	"github.com/matrixorigin/matrixone/pkg/txn/clock"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

var (
	defaultWaitInterval = 100 * time.Millisecond
	defaultTestTimeout  = 3 * time.Minute
)

// Cluster describes the behavior of the test framework.
type Cluster interface {
	// Start starts the services sequentially; when it returns,
	// system initialization is complete.
	Start() error
	// Close stops the services sequentially.
	Close() error
	// Options returns the adjusted options.
	Options() Options
	// Clock returns the cluster clock.
	Clock() clock.Clock

	ClusterOperation
	ClusterAwareness
	ClusterState
	ClusterWaitState
}
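// A typical lifecycle for a Cluster in a test looks roughly like the sketch
// below (illustrative only; it assumes this package's DefaultOptions helper
// for a baseline Options value):
//
//	func TestClusterLifecycle(t *testing.T) {
//		ctx := context.Background()
//		c, err := NewCluster(ctx, t, DefaultOptions())
//		require.NoError(t, err)
//		require.NoError(t, c.Start())
//		defer func() { require.NoError(t, c.Close()) }()
//		// drive the cluster via ClusterOperation / ClusterWaitState ...
//	}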
// ClusterOperation supports various cluster operations.
type ClusterOperation interface {
	// CloseTNService closes tn service by uuid.
	CloseTNService(uuid string) error
	// StartTNService starts tn service by uuid.
	StartTNService(uuid string) error

	// CloseTNServiceIndexed closes tn service by its index.
	CloseTNServiceIndexed(index int) error
	// StartTNServiceIndexed starts tn service by its index.
	StartTNServiceIndexed(index int) error

	// CloseLogService closes log service by uuid.
	CloseLogService(uuid string) error
	// StartLogService starts log service by uuid.
	StartLogService(uuid string) error

	// CloseLogServiceIndexed closes log service by its index.
	CloseLogServiceIndexed(index int) error
	// StartLogServiceIndexed starts log service by its index.
	StartLogServiceIndexed(index int) error

	// CloseCNService closes cn service by uuid.
	CloseCNService(uuid string) error
	// StartCNService starts cn service by uuid.
	StartCNService(uuid string) error

	// CloseCNServiceIndexed closes cn service by its index.
	CloseCNServiceIndexed(index int) error
	// StartCNServiceIndexed starts cn service by its index.
	StartCNServiceIndexed(index int) error

	// StartCNServices starts n new cn services.
	StartCNServices(n int) error

	// NewNetworkPartition constructs a network partition from service indexes.
	NewNetworkPartition(tnIndexes, logIndexes, cnIndexes []uint32) NetworkPartition
	// RemainingNetworkPartition returns the partition for the remaining services.
	RemainingNetworkPartition(partitions ...NetworkPartition) NetworkPartition
	// StartNetworkPartition enables the network partition feature.
	StartNetworkPartition(partitions ...NetworkPartition)
	// CloseNetworkPartition disables the network partition feature.
	CloseNetworkPartition()
}

// ClusterAwareness provides cluster awareness information.
type ClusterAwareness interface {
	// ListTNServices lists the uuids of all tn services.
	ListTNServices() []string
	// ListLogServices lists the uuids of all log services.
	ListLogServices() []string
	// ListCnServices lists the uuids of all cn services.
	ListCnServices() []string
	// ListHAKeeperServices lists all hakeeper log services.
	ListHAKeeperServices() []LogService

	// GetTNService fetches a tn service instance by uuid.
	GetTNService(uuid string) (TNService, error)
	// GetLogService fetches a log service instance by uuid.
	GetLogService(uuid string) (LogService, error)
	// GetTNServiceIndexed fetches a tn service instance by index.
	GetTNServiceIndexed(index int) (TNService, error)
	// GetLogServiceIndexed fetches a log service instance by index.
	GetLogServiceIndexed(index int) (LogService, error)
	// GetCNService fetches a cn service instance by uuid.
	GetCNService(uuid string) (CNService, error)
	// GetCNServiceIndexed fetches a cn service instance by index.
	GetCNServiceIndexed(index int) (CNService, error)

	// GetClusterState fetches the current cluster state.
	GetClusterState(ctx context.Context) (*logpb.CheckerState, error)
}
// ClusterState provides cluster running state.
type ClusterState interface {
	// ListTNShards lists all tn shards within the cluster.
	ListTNShards(ctx context.Context) ([]metadata.TNShardRecord, error)
	// ListLogShards lists all log shards within the cluster.
	ListLogShards(ctx context.Context) ([]metadata.LogShardRecord, error)

	// GetTNStoreInfo gets tn store information by uuid.
	GetTNStoreInfo(ctx context.Context, uuid string) (logpb.TNStoreInfo, error)
	// GetTNStoreInfoIndexed gets tn store information by index.
	GetTNStoreInfoIndexed(ctx context.Context, index int) (logpb.TNStoreInfo, error)

	// GetLogStoreInfo gets log store information by uuid.
	GetLogStoreInfo(ctx context.Context, uuid string) (logpb.LogStoreInfo, error)
	// GetLogStoreInfoIndexed gets log store information by index.
	GetLogStoreInfoIndexed(ctx context.Context, index int) (logpb.LogStoreInfo, error)

	// GetCNStoreInfo gets cn store information by uuid.
	GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error)
	// GetCNStoreInfoIndexed gets cn store information by index.
	GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error)

	// GetHAKeeperState returns the hakeeper state from a running hakeeper.
	GetHAKeeperState() logpb.HAKeeperState
	// GetHAKeeperConfig returns the hakeeper configuration.
	GetHAKeeperConfig() hakeeper.Config

	// TNStoreExpired checks whether a tn store has expired, by uuid.
	TNStoreExpired(uuid string) (bool, error)
	// TNStoreExpiredIndexed checks whether a tn store has expired, by index.
	TNStoreExpiredIndexed(index int) (bool, error)
	// LogStoreExpired checks whether a log store has expired, by uuid.
	LogStoreExpired(uuid string) (bool, error)
	// LogStoreExpiredIndexed checks whether a log store has expired, by index.
	LogStoreExpiredIndexed(index int) (bool, error)
	// CNStoreExpired checks whether a cn store has expired, by uuid.
	CNStoreExpired(uuid string) (bool, error)
	// CNStoreExpiredIndexed checks whether a cn store has expired, by index.
	CNStoreExpiredIndexed(index int) (bool, error)

	// IsClusterHealthy checks whether the cluster is healthy.
	IsClusterHealthy() bool
}

// ClusterWaitState blocks until the expected cluster state is reached or the
// context is done.
type ClusterWaitState interface {
	// WaitHAKeeperLeader waits until the hakeeper leader is elected and returns it.
	WaitHAKeeperLeader(ctx context.Context) LogService
	// WaitHAKeeperState waits for the specified hakeeper state.
	WaitHAKeeperState(ctx context.Context, expected logpb.HAKeeperState)

	// WaitTNShardsReported waits until the expected count of tn shards is reported.
	WaitTNShardsReported(ctx context.Context)
	// WaitLogShardsReported waits until the expected count of log shards is reported.
	WaitLogShardsReported(ctx context.Context)
	// WaitTNReplicaReported waits until a tn replica is reported.
	WaitTNReplicaReported(ctx context.Context, shardID uint64)
	// WaitLogReplicaReported waits until log replicas are reported.
	WaitLogReplicaReported(ctx context.Context, shardID uint64)

	// WaitTNStoreTimeout waits for a tn store to time out, by uuid.
	WaitTNStoreTimeout(ctx context.Context, uuid string)
	// WaitTNStoreTimeoutIndexed waits for a tn store to time out, by index.
	WaitTNStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitTNStoreReported waits until a tn store is reported, by uuid.
	WaitTNStoreReported(ctx context.Context, uuid string)
	// WaitTNStoreReportedIndexed waits until a tn store is reported, by index.
	WaitTNStoreReportedIndexed(ctx context.Context, index int)
	// WaitTNStoreTaskServiceCreated waits until a tn store's task service is started, by uuid.
	WaitTNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitTNStoreTaskServiceCreatedIndexed waits until a tn store's task service is started, by index.
	WaitTNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitCNStoreReported waits until a cn store is reported, by uuid.
	WaitCNStoreReported(ctx context.Context, uuid string)
	// WaitCNStoreReportedIndexed waits until a cn store is reported, by index.
	WaitCNStoreReportedIndexed(ctx context.Context, index int)
	// WaitCNStoreTaskServiceCreated waits until a cn store's task service is started, by uuid.
	WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitCNStoreTaskServiceCreatedIndexed waits until a cn store's task service is started, by index.
	WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitLogStoreTaskServiceCreated waits until a log store's task service is started, by uuid.
	WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitLogStoreTaskServiceCreatedIndexed waits until a log store's task service is started, by index.
	WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int)

	// WaitLogStoreTimeout waits for a log store to time out, by uuid.
	WaitLogStoreTimeout(ctx context.Context, uuid string)
	// WaitLogStoreTimeoutIndexed waits for a log store to time out, by index.
	WaitLogStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitLogStoreReported waits until a log store is reported, by uuid.
	WaitLogStoreReported(ctx context.Context, uuid string)
	// WaitLogStoreReportedIndexed waits until a log store is reported, by index.
	WaitLogStoreReportedIndexed(ctx context.Context, index int)
}
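// A typical wait-style failover test combines ClusterOperation and
// ClusterWaitState (an illustrative sketch; the index is an assumption and
// `c` is an already-started Cluster):
//
//	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
//	defer cancel()
//	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
//	require.NoError(t, c.CloseLogServiceIndexed(0))
//	c.WaitLogStoreTimeoutIndexed(ctx, 0)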
// ----------------------------------------------------
// The following implements the interface `Cluster`.
// ----------------------------------------------------

// testCluster simulates a cluster with log, tn and cn services.
type testCluster struct {
	t       *testing.T
	testID  string
	opt     Options
	logger  *zap.Logger
	stopper *stopper.Stopper
	clock   clock.Clock

	tn struct {
		sync.Mutex
		cfgs []*tnservice.Config
		opts []tnOptions
		svcs []TNService
	}

	log struct {
		once sync.Once

		sync.Mutex
		cfgs []logservice.Config
		opts []logOptions
		svcs []LogService
	}

	cn struct {
		sync.Mutex
		cfgs []*cnservice.Config
		opts []cnOptions
		svcs []CNService
	}

	network struct {
		addresses *serviceAddresses

		sync.RWMutex
		addressSets []addressSet
	}

	fileservices *fileServices

	mu struct {
		sync.Mutex
		running bool
	}
}

// NewCluster constructs a cluster for integration tests.
func NewCluster(ctx context.Context, t *testing.T, opt Options) (Cluster, error) {
	logutil.SetupMOLogger(&logutil.LogConfig{
		Level:  "fatal",
		Format: "console",
	})
	opt.validate()

	uid, _ := uuid.NewV7()
	c := &testCluster{
		t:       t,
		testID:  uid.String(),
		opt:     opt,
		stopper: stopper.NewStopper("test-cluster"),
	}
	c.logger = logutil.Adjust(opt.logger).With(zap.String("testcase", t.Name())).With(zap.String("test-id", c.testID))
	c.opt.rootDataDir = filepath.Join(c.opt.rootDataDir, c.testID, t.Name())
	if c.clock == nil {
		c.clock = clock.NewUnixNanoHLCClockWithStopper(c.stopper, 0)
	}

	// TODO: CN and LOG use process level runtime
	runtime.SetupProcessLevelRuntime(c.newRuntime())

	// build addresses for all services
	c.network.addresses = c.buildServiceAddresses()
	// build log service configurations
	c.log.cfgs, c.log.opts = c.buildLogConfigs()
	// build tn service configurations
	c.tn.cfgs, c.tn.opts = c.buildTNConfigs()

	// build FileService instances
	c.fileservices = c.buildFileServices(ctx)

	// build cn service configurations
	c.buildCNConfigs(c.opt.initial.cnServiceNum)
	return c, nil
}

func (c *testCluster) Start() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.mu.running {
		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	c.mu.running = true
	// start log services first
	if err := c.startLogServices(ctx); err != nil {
		return err
	}

	// start tn services
	if err := c.startTNServices(ctx); err != nil {
		return err
	}

	// start cn services
	if err := c.startCNServices(ctx); err != nil {
		return err
	}

	return nil
}

func (c *testCluster) Options() Options {
	return c.opt
}

func (c *testCluster) Clock() clock.Clock {
	return c.clock
}
func (c *testCluster) Close() error {
	defer logutil.LogClose(c.logger, "tests-framework")()
	c.logger.Info("closing testCluster")

	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.mu.running {
		return nil
	}

	// close all cn services first
	if err := c.closeCNServices(); err != nil {
		return err
	}

	// close all tn services
	if err := c.closeTNServices(); err != nil {
		return err
	}

	// close all log services
	if err := c.closeLogServices(); err != nil {
		return err
	}

	c.mu.running = false
	c.stopper.Stop()

	if !c.opt.keepData {
		if err := os.RemoveAll(c.opt.rootDataDir); err != nil {
			return err
		}
	}
	return nil
}

// ----------------------------------------------------------
// The following implements the interface `ClusterState`.
// ----------------------------------------------------------
func (c *testCluster) ListTNShards(
	ctx context.Context,
) ([]metadata.TNShardRecord, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return nil, err
	}
	return state.ClusterInfo.TNShards, nil
}

func (c *testCluster) ListLogShards(
	ctx context.Context,
) ([]metadata.LogShardRecord, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return nil, err
	}
	return state.ClusterInfo.LogShards, nil
}

func (c *testCluster) GetTNStoreInfo(
	ctx context.Context, uuid string,
) (logpb.TNStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.TNStoreInfo{}, err
	}
	stores := state.TNState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.TNStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetTNStoreInfoIndexed(
	ctx context.Context, index int,
) (logpb.TNStoreInfo, error) {
	ds, err := c.GetTNServiceIndexed(index)
	if err != nil {
		return logpb.TNStoreInfo{}, err
	}
	return c.GetTNStoreInfo(ctx, ds.ID())
}

func (c *testCluster) GetLogStoreInfo(
	ctx context.Context, uuid string,
) (logpb.LogStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.LogStoreInfo{}, err
	}
	stores := state.LogState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.LogStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetLogStoreInfoIndexed(
	ctx context.Context, index int,
) (logpb.LogStoreInfo, error) {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return logpb.LogStoreInfo{}, err
	}
	return c.GetLogStoreInfo(ctx, ls.ID())
}

func (c *testCluster) GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.CNStoreInfo{}, err
	}
	stores := state.CNState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.CNStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error) {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return logpb.CNStoreInfo{}, err
	}
	return c.GetCNStoreInfo(ctx, cs.ID())
}

func (c *testCluster) GetHAKeeperState() logpb.HAKeeperState {
	state := c.getClusterState()
	require.NotNil(c.t, state)
	return state.State
}
func (c *testCluster) GetHAKeeperConfig() hakeeper.Config {
	return c.opt.BuildHAKeeperConfig()
}

func (c *testCluster) TNStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	tnStore, ok := state.TNState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.TNStoreExpired(tnStore.Tick, state.Tick)

	c.logger.Info(
		"check tn store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("tn store tick", tnStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) TNStoreExpiredIndexed(index int) (bool, error) {
	ds, err := c.GetTNServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.TNStoreExpired(ds.ID())
}

func (c *testCluster) LogStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	logStore, ok := state.LogState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.LogStoreExpired(logStore.Tick, state.Tick)

	c.logger.Info(
		"check log store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("log store tick", logStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) LogStoreExpiredIndexed(index int) (bool, error) {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.LogStoreExpired(ls.ID())
}

func (c *testCluster) CNStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	cnStore, ok := state.CNState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.CNStoreExpired(cnStore.Tick, state.Tick)

	c.logger.Info(
		"check cn store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("cn store tick", cnStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) CNStoreExpiredIndexed(index int) (bool, error) {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.CNStoreExpired(cs.ID())
}

func (c *testCluster) IsClusterHealthy() bool {
	hkcfg := c.GetHAKeeperConfig()
	state := c.getClusterState()
	_, healthy := syshealth.Check(
		hkcfg,
		state.GetClusterInfo(),
		state.GetTNState(),
		state.GetLogState(),
		state.GetTick(),
	)
	return healthy
}
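// IsClusterHealthy pairs naturally with polling in tests. An illustrative
// sketch using testify's Eventually (this package itself does not do this):
//
//	require.Eventually(t, c.IsClusterHealthy, time.Minute, defaultWaitInterval)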
// --------------------------------------------------------------
// The following implements the interface `ClusterWaitState`.
// --------------------------------------------------------------
func (c *testCluster) WaitHAKeeperLeader(ctx context.Context) LogService {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for hakeeper leader",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			leader := c.getHAKeeperLeader()
			if leader != nil {
				return leader
			}
		}
	}
}

func (c *testCluster) WaitHAKeeperState(
	ctx context.Context, expected logpb.HAKeeperState,
) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for hakeeper state",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}
			if state.State == expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitTNShardsReported(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for all tn shards reported",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseExpectedTNShardCount(state.ClusterInfo)
			reported := ParseReportedTNShardCount(
				state.TNState, c.GetHAKeeperConfig(), state.Tick,
			)

			// FIXME: what about reported larger than expected
			if reported >= expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogShardsReported(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for all log shards reported",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseExpectedLogShardCount(state.ClusterInfo)
			reported := ParseReportedLogShardCount(
				state.LogState, c.GetHAKeeperConfig(), state.Tick,
			)
			// FIXME: what about reported larger than expected
			if reported >= expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitTNReplicaReported(ctx context.Context, shardID uint64) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting replica of tn shard reported",
				"shard %d, error: %s", shardID, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			reported := ParseTNShardReportedSize(
				shardID, state.TNState, c.GetHAKeeperConfig(), state.Tick,
			)
			if reported >= TNShardExpectedSize {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogReplicaReported(ctx context.Context, shardID uint64) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting replica of log shard reported",
				"shard %d, error: %s", shardID, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseLogShardExpectedSize(shardID, state.ClusterInfo)
			reported := ParseLogShardReportedSize(
				shardID, state.LogState, c.GetHAKeeperConfig(), state.Tick,
			)
			if reported >= expected {
				return
			}
		}
	}
}
func (c *testCluster) WaitTNStoreTimeout(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting tn store timeout",
				"tn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.TNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check tn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitTNStoreTimeoutIndexed(ctx context.Context, index int) {
	ds, err := c.GetTNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitTNStoreTimeout(ctx, ds.ID())
}

func (c *testCluster) WaitTNStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting tn store reported",
				"tn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.TNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check tn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitTNStoreReportedIndexed(ctx context.Context, index int) {
	ds, err := c.GetTNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitTNStoreReported(ctx, ds.ID())
}

func (c *testCluster) WaitCNStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting cn store reported",
				"cn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.CNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check cn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitCNStoreReportedIndexed(ctx context.Context, index int) {
	cs, err := c.GetCNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitCNStoreReported(ctx, cs.ID())
}

func (c *testCluster) WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string) {
	cs, err := c.GetCNService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on cn store",
				"cn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := cs.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}

func (c *testCluster) WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	cs, err := c.GetCNServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitCNStoreTaskServiceCreated(ctx, cs.ID())
}

func (c *testCluster) WaitTNStoreTaskServiceCreated(ctx context.Context, uuid string) {
	ds, err := c.GetTNService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on tn store",
				"tn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := ds.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}
func (c *testCluster) WaitTNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	ds, err := c.GetTNServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitTNStoreTaskServiceCreated(ctx, ds.ID())
}

func (c *testCluster) WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string) {
	ls, err := c.GetLogService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on log store",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := ls.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}

func (c *testCluster) WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	ls, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitLogStoreTaskServiceCreated(ctx, ls.ID())
}

func (c *testCluster) WaitLogStoreTimeout(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting log store timeout",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.LogStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check log store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogStoreTimeoutIndexed(ctx context.Context, index int) {
	ls, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitLogStoreTimeout(ctx, ls.ID())
}

func (c *testCluster) WaitLogStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting log store reported",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.LogStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check log store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogStoreReportedIndexed(ctx context.Context, index int) {
	ls, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitLogStoreReported(ctx, ls.ID())
}
// --------------------------------------------------------------
// The following implements the interface `ClusterAwareness`.
// --------------------------------------------------------------
func (c *testCluster) ListTNServices() []string {
	ids := make([]string, 0, len(c.tn.svcs))
	for _, cfg := range c.tn.cfgs {
		ids = append(ids, cfg.UUID)
	}
	return ids
}

func (c *testCluster) ListLogServices() []string {
	ids := make([]string, 0, len(c.log.svcs))
	for _, svc := range c.log.svcs {
		ids = append(ids, svc.ID())
	}
	return ids
}

func (c *testCluster) ListCnServices() []string {
	ids := make([]string, 0, len(c.cn.svcs))
	for _, svc := range c.cn.svcs {
		ids = append(ids, svc.ID())
	}
	return ids
}

func (c *testCluster) ListHAKeeperServices() []LogService {
	return c.selectHAkeeperServices()
}

func (c *testCluster) GetTNService(uuid string) (TNService, error) {
	c.tn.Lock()
	defer c.tn.Unlock()

	for i, cfg := range c.tn.cfgs {
		if cfg.UUID == uuid {
			return c.tn.svcs[i], nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetLogService(uuid string) (LogService, error) {
	c.log.Lock()
	defer c.log.Unlock()

	for _, svc := range c.log.svcs {
		if svc.ID() == uuid {
			return svc, nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetCNService(uuid string) (CNService, error) {
	c.cn.Lock()
	defer c.cn.Unlock()

	for _, svc := range c.cn.svcs {
		if svc.ID() == uuid {
			return svc, nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetTNServiceIndexed(index int) (TNService, error) {
	c.tn.Lock()
	defer c.tn.Unlock()

	if index >= len(c.tn.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.tn.svcs[index], nil
}

func (c *testCluster) GetLogServiceIndexed(index int) (LogService, error) {
	c.log.Lock()
	defer c.log.Unlock()

	if index >= len(c.log.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.log.svcs[index], nil
}

func (c *testCluster) GetCNServiceIndexed(index int) (CNService, error) {
	c.cn.Lock()
	defer c.cn.Unlock()

	if index >= len(c.cn.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.cn.svcs[index], nil
}

// NB: we could also fetch cluster state from a non-leader hakeeper.
func (c *testCluster) GetClusterState(
	ctx context.Context,
) (*logpb.CheckerState, error) {
	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
	leader := c.WaitHAKeeperLeader(ctx)
	return leader.GetClusterState()
}
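// Reading the cluster state in a test (illustrative sketch; `c` is a started
// Cluster and `t` the surrounding *testing.T):
//
//	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
//	defer cancel()
//	state, err := c.GetClusterState(ctx)
//	require.NoError(t, err)
//	t.Logf("tick %d, %d tn shards", state.Tick, len(state.ClusterInfo.TNShards))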
// --------------------------------------------------------------
// The following implements the interface `ClusterOperation`.
// --------------------------------------------------------------
func (c *testCluster) CloseTNService(uuid string) error {
	ds, err := c.GetTNService(uuid)
	if err != nil {
		return err
	}
	return ds.Close()
}

func (c *testCluster) StartTNService(uuid string) error {
	ds, err := c.GetTNService(uuid)
	if err != nil {
		return err
	}
	return ds.Start()
}

func (c *testCluster) CloseTNServiceIndexed(index int) error {
	ds, err := c.GetTNServiceIndexed(index)
	if err != nil {
		return err
	}
	return ds.Close()
}

func (c *testCluster) StartTNServiceIndexed(index int) error {
	ds, err := c.GetTNServiceIndexed(index)
	if err != nil {
		return err
	}
	return ds.Start()
}

func (c *testCluster) CloseLogService(uuid string) error {
	ls, err := c.GetLogService(uuid)
	if err != nil {
		return err
	}
	return ls.Close()
}

func (c *testCluster) StartLogService(uuid string) error {
	ls, err := c.GetLogService(uuid)
	if err != nil {
		return err
	}
	return ls.Start()
}

func (c *testCluster) CloseLogServiceIndexed(index int) error {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return err
	}
	return ls.Close()
}

func (c *testCluster) StartLogServiceIndexed(index int) error {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return err
	}
	return ls.Start()
}

func (c *testCluster) CloseCNService(uuid string) error {
	cs, err := c.GetCNService(uuid)
	if err != nil {
		return err
	}
	return cs.Close()
}

func (c *testCluster) StartCNService(uuid string) error {
	cs, err := c.GetCNService(uuid)
	if err != nil {
		return err
	}
	return cs.Start()
}

func (c *testCluster) CloseCNServiceIndexed(index int) error {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return err
	}
	return cs.Close()
}

func (c *testCluster) StartCNServiceIndexed(index int) error {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return err
	}
	return cs.Start()
}

func (c *testCluster) StartCNServices(n int) error {
	offset := len(c.cn.svcs)
	c.buildCNConfigs(n)
	c.initCNServices(c.fileservices, offset)

	for _, cs := range c.cn.svcs[offset:] {
		if err := cs.Start(); err != nil {
			return err
		}
	}
	return nil
}

func (c *testCluster) NewNetworkPartition(
	tnIndexes, logIndexes, cnIndexes []uint32,
) NetworkPartition {
	return newNetworkPartition(
		c.opt.initial.logServiceNum, logIndexes,
		c.opt.initial.tnServiceNum, tnIndexes,
		c.opt.initial.cnServiceNum, cnIndexes,
	)
}

func (c *testCluster) RemainingNetworkPartition(
	partitions ...NetworkPartition,
) NetworkPartition {
	return remainingNetworkPartition(c.opt.initial.logServiceNum, c.opt.initial.tnServiceNum, 0, partitions...)
}
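// Partition usage sketch (indexes are illustrative): isolate log service 0
// from everything else, then heal the network afterwards:
//
//	p := c.NewNetworkPartition(nil, []uint32{0}, nil)
//	rest := c.RemainingNetworkPartition(p)
//	c.StartNetworkPartition(p, rest)
//	defer c.CloseNetworkPartition()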
func (c *testCluster) StartNetworkPartition(parts ...NetworkPartition) {
	c.network.Lock()
	defer c.network.Unlock()

	addressSets := c.network.addresses.buildPartitionAddressSets(parts...)
	c.network.addressSets = addressSets
}

func (c *testCluster) CloseNetworkPartition() {
	c.network.Lock()
	defer c.network.Unlock()

	c.network.addressSets = nil
}

// ------------------------------------------------------
// The following are private utilities for `testCluster`.
// ------------------------------------------------------

// buildServiceAddresses builds addresses for all services.
func (c *testCluster) buildServiceAddresses() *serviceAddresses {
	return newServiceAddresses(
		c.t,
		c.opt.initial.logServiceNum,
		c.opt.initial.tnServiceNum,
		c.opt.initial.cnServiceNum,
		c.opt.hostAddr)
}

// buildTNConfigs builds configurations for all tn services.
func (c *testCluster) buildTNConfigs() ([]*tnservice.Config, []tnOptions) {
	batch := c.opt.initial.tnServiceNum

	cfgs := make([]*tnservice.Config, 0, batch)
	opts := make([]tnOptions, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := buildTNConfig(i, c.opt, c.network.addresses)
		cfgs = append(cfgs, cfg)

		localAddr := cfg.ListenAddress
		opt := buildTNOptions(cfg, c.backendFilterFactory(localAddr))
		opts = append(opts, opt)
	}
	return cfgs, opts
}

// buildLogConfigs builds configurations for all log services.
func (c *testCluster) buildLogConfigs() ([]logservice.Config, []logOptions) {
	batch := c.opt.initial.logServiceNum

	cfgs := make([]logservice.Config, 0, batch)
	opts := make([]logOptions, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := buildLogConfig(i, c.opt, c.network.addresses)
		cfgs = append(cfgs, cfg)

		localAddr := cfg.LogServiceServiceAddr()
		opt := buildLogOptions(cfg, c.backendFilterFactory(localAddr))
		opts = append(opts, opt)
	}
	return cfgs, opts
}

func (c *testCluster) buildCNConfigs(n int) {
	offset := len(c.cn.opts)
	batch := n
	c.network.addresses.buildCNAddress(c.t, batch, c.opt.hostAddr)
	for i := 0; i < batch; i++ {
		cfg := buildCNConfig(i+offset, c.opt, c.network.addresses)
		c.cn.cfgs = append(c.cn.cfgs, cfg)
		var opt cnOptions
		if c.opt.cn.optionFunc != nil {
			opt = c.opt.cn.optionFunc(i + offset)
		}
		opt = append(opt, cnservice.WithLogger(c.logger))
		c.cn.opts = append(c.cn.opts, opt)

		c.fileservices.cnLocalFSs = append(c.fileservices.cnLocalFSs,
			c.createFS(context.Background(), filepath.Join(c.opt.rootDataDir, cfg.UUID), defines.LocalFileServiceName))
		c.fileservices.cnServiceNum++
	}
}

// initTNServices builds all tn services.
//
// Before initializing the tn services, the log services must already be started.
func (c *testCluster) initTNServices(fileservices *fileServices) []TNService {
	batch := c.opt.initial.tnServiceNum

	c.logger.Info("initialize tn services", zap.Int("batch", batch))

	svcs := make([]TNService, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := c.tn.cfgs[i]
		opt := c.tn.opts[i]
		fs, err := fileservice.NewFileServices(
			"",
			fileservices.getTNLocalFileService(i),
			fileservices.getS3FileService(),
		)
		if err != nil {
			panic(err)
		}
		ds, err := newTNService(
			cfg,
			c.newRuntime(),
			fs,
			opt)
		require.NoError(c.t, err)

		c.logger.Info(
			"tn service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		svcs = append(svcs, ds)
	}

	return svcs
}

// initLogServices builds all log services.
func (c *testCluster) initLogServices() []LogService {
	batch := c.opt.initial.logServiceNum

	c.logger.Info("initialize log services", zap.Int("batch", batch))

	svcs := make([]LogService, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := c.log.cfgs[i]
		opt := c.log.opts[i]
		ls, err := newLogService(cfg, testutil.NewFS(), opt)
		require.NoError(c.t, err)

		c.logger.Info(
			"log service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		svcs = append(svcs, ls)
	}
	return svcs
}

func (c *testCluster) initCNServices(
	fileservices *fileServices,
	offset int) {
	batch := len(c.cn.cfgs)

	c.logger.Info("initialize cn services", zap.Int("batch", batch))
	for i := offset; i < batch; i++ {
		cfg := c.cn.cfgs[i]
		opt := c.cn.opts[i]
		fs, err := fileservice.NewFileServices(
			"",
			fileservices.getCNLocalFileService(i),
			fileservices.getS3FileService(),
			fileservices.getETLFileService(),
		)
		if err != nil {
			panic(err)
		}
		ctx, cancel := context.WithCancel(context.Background())
		cs, err := newCNService(cfg, ctx, fs, opt)
		if err != nil {
			panic(err)
		}
		cs.SetCancel(cancel)

		c.logger.Info(
			"cn service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		c.cn.svcs = append(c.cn.svcs, cs)
	}
}

// startTNServices initializes and starts all tn services.
func (c *testCluster) startTNServices(ctx context.Context) error {
	// initialize all tn services
	c.tn.svcs = c.initTNServices(c.fileservices)

	// start tn services
	for _, ds := range c.tn.svcs {
		if err := ds.Start(); err != nil {
			return err
		}
	}

	c.WaitTNShardsReported(ctx)
	return nil
}

// startLogServices initializes and starts all log services.
func (c *testCluster) startLogServices(ctx context.Context) error {
	// initialize all log services
	c.log.svcs = c.initLogServices()

	// start log services
	for _, ls := range c.log.svcs {
		if err := ls.Start(); err != nil {
			return err
		}
	}

	// start hakeeper replicas
	if err := c.startHAKeeperReplica(); err != nil {
		return err
	}

	// initialize cluster information
	if err := c.setInitialClusterInfo(); err != nil {
		return err
	}

	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
	return nil
}

func (c *testCluster) startCNServices(ctx context.Context) error {
	c.initCNServices(c.fileservices, 0)

	for _, cs := range c.cn.svcs {
		if err := cs.Start(); err != nil {
			return err
		}
	}

	return nil
}

// closeTNServices closes all tn services.
func (c *testCluster) closeTNServices() error {
	c.logger.Info("start to close tn services")

	for i, ds := range c.tn.svcs {
		c.logger.Info("close tn service", zap.Int("index", i))
		if err := ds.Close(); err != nil {
			return err
		}
		c.logger.Info("tn service closed", zap.Int("index", i))
	}

	return nil
}

// closeLogServices closes all log services.
func (c *testCluster) closeLogServices() error {
	defer logutil.LogClose(c.logger, "tests-framework/logservices")()

	for i, ls := range c.log.svcs {
		c.logger.Info("close log service", zap.Int("index", i))
		if err := ls.Close(); err != nil {
			return err
		}
		c.logger.Info("log service closed", zap.Int("index", i))
	}

	return nil
}

func (c *testCluster) closeCNServices() error {
	defer logutil.LogClose(c.logger, "tests-framework/cnservices")()

	for i, cs := range c.cn.svcs {
		c.logger.Info("close cn service", zap.Int("index", i))
		if err := cs.Close(); err != nil {
			return err
		}
		c.logger.Info("cn service closed", zap.Int("index", i))
	}

	return nil
}

// getClusterState fetches cluster state from an arbitrary hakeeper.
//
// NB: it's possible that getClusterState returns a nil value.
func (c *testCluster) getClusterState() *logpb.CheckerState {
	var state *logpb.CheckerState
	fn := func(index int, svc LogService) bool {
		s, err := svc.GetClusterState()
		if err != nil {
			c.logger.Error(
				"fail to get cluster state",
				zap.Error(err),
				zap.Int("index", index),
			)
			return false
		}
		state = s
		// XXX MPOOL
		// Too much logging can break CI.
		// c.logger.Info("current cluster state", zap.Any("state", s))
		return true
	}
	c.rangeHAKeeperService(fn)
	return state
}

// getHAKeeperLeader gets the log service which is the hakeeper leader.
func (c *testCluster) getHAKeeperLeader() LogService {
	var leader LogService
	fn := func(index int, svc LogService) bool {
		isLeader, err := svc.IsLeaderHakeeper()
		if err != nil {
			c.logger.Error(
				"fail to check hakeeper",
				zap.Error(err),
				zap.Int("index", index),
			)
			return false
		}
		c.logger.Info(
			"hakeeper state",
			zap.Bool("isLeader", isLeader),
			zap.Int("index", index),
		)

		if isLeader {
			leader = svc
			return true
		}

		return false
	}
	c.rangeHAKeeperService(fn)
	return leader
}

// rangeHAKeeperService iterates over all hakeeper services until `fn` returns true.
func (c *testCluster) rangeHAKeeperService(
	fn func(index int, svc LogService) bool,
) {
	for i, svc := range c.selectHAkeeperServices() {
		index := i

		if svc.Status() != ServiceStarted {
			c.logger.Warn(
				"hakeeper service not started",
				zap.Int("index", index),
			)
			continue
		}

		if fn(index, svc) {
			break
		}
	}
}

func (c *testCluster) newRuntime() runtime.Runtime {
	return runtime.NewRuntime(metadata.ServiceType_CN, "", c.logger, runtime.WithClock(c.clock))
}

// FilterFunc returns true if traffic is allowed.
type FilterFunc func(morpc.Message, string) bool

// backendFilterFactory constructs a closure with the type FilterFunc.
func (c *testCluster) backendFilterFactory(localAddr string) FilterFunc {
	return func(_ morpc.Message, backendAddr string) bool {
		// NB: it's possible that a partition briefly takes effect again after
		// it has been disabled, since in-flight traffic may still observe the
		// old address sets.
		c.network.RLock()
		addressSets := c.network.addressSets
		c.network.RUnlock()

		if len(addressSets) == 0 {
			return true
		}

		for _, addrSet := range addressSets {
			if addrSet.contains(localAddr) &&
				addrSet.contains(backendAddr) {
				return true
			}
		}

		c.logger.Info(
			"traffic not allowed",
			zap.String("local", localAddr),
			zap.String("backend", backendAddr),
		)

		return false
	}
}
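// An illustrative check of the filter semantics (the addresses are
// hypothetical; the message argument is ignored by the filter, so nil is
// acceptable here): with no partitions installed, everything passes; once
// partitions are installed, traffic is allowed only within one address set.
//
//	filter := c.backendFilterFactory("tn-0.addr")
//	allowed := filter(nil, "log-0.addr") // true only when no partition is
//	                                     // active or both addresses share a set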