github.com/matrixorigin/matrixone@v0.7.0/pkg/tests/service/service.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package service

import (
	"context"
	"os"
	"path/filepath"
	"sync"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/matrixorigin/matrixone/pkg/cnservice"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/common/stopper"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/dnservice"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/hakeeper"
	"github.com/matrixorigin/matrixone/pkg/hakeeper/checkers/syshealth"
	"github.com/matrixorigin/matrixone/pkg/logservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	logpb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
	"github.com/matrixorigin/matrixone/pkg/testutil"
	"github.com/matrixorigin/matrixone/pkg/txn/clock"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
)

var (
	defaultWaitInterval = 100 * time.Millisecond
	defaultTestTimeout  = 3 * time.Minute
)

// Cluster describes the behavior of the test framework.
type Cluster interface {
	// Start starts the services sequentially; once it returns, system
	// initialization is completed.
	Start() error
	// Close stops the services sequentially.
	Close() error
	// Options returns the adjusted options.
	Options() Options

	ClusterOperation
	ClusterAwareness
	ClusterState
	ClusterWaitState
}
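
// exampleClusterLifecycle is a hedged usage sketch of the Cluster interface,
// not part of the framework itself. It assumes the zero-valued Options is
// acceptable, since NewCluster calls opt.validate() to fill in defaults;
// adjust the Options construction to your test's needs.
func exampleClusterLifecycle(t *testing.T) {
	c, err := NewCluster(t, Options{})
	require.NoError(t, err)

	// Start boots log, dn and cn services in order; Close tears them down
	// in the reverse order.
	require.NoError(t, c.Start())
	defer func() {
		require.NoError(t, c.Close())
	}()

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
}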

// ClusterOperation supports various kinds of cluster operations.
type ClusterOperation interface {
	// CloseDNService closes dn service by uuid.
	CloseDNService(uuid string) error
	// StartDNService starts dn service by uuid.
	StartDNService(uuid string) error

	// CloseDNServiceIndexed closes dn service by its index.
	CloseDNServiceIndexed(index int) error
	// StartDNServiceIndexed starts dn service by its index.
	StartDNServiceIndexed(index int) error

	// CloseLogService closes log service by uuid.
	CloseLogService(uuid string) error
	// StartLogService starts log service by uuid.
	StartLogService(uuid string) error

	// CloseLogServiceIndexed closes log service by its index.
	CloseLogServiceIndexed(index int) error
	// StartLogServiceIndexed starts log service by its index.
	StartLogServiceIndexed(index int) error

	// CloseCNService closes cn service by uuid.
	CloseCNService(uuid string) error
	// StartCNService starts cn service by uuid.
	StartCNService(uuid string) error

	// CloseCNServiceIndexed closes cn service by its index.
	CloseCNServiceIndexed(index int) error
	// StartCNServiceIndexed starts cn service by its index.
	StartCNServiceIndexed(index int) error

	// NewNetworkPartition constructs a network partition from service indexes.
	NewNetworkPartition(dnIndexes, logIndexes, cnIndexes []uint32) NetworkPartition
	// RemainingNetworkPartition returns the partition for the remaining services.
	RemainingNetworkPartition(partitions ...NetworkPartition) NetworkPartition
	// StartNetworkPartition enables the network partition feature.
	StartNetworkPartition(partitions ...NetworkPartition)
	// CloseNetworkPartition disables the network partition feature.
	CloseNetworkPartition()
}
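
// exampleNetworkPartition is a hedged sketch of the partition API above, not
// part of the framework: it isolates the first dn service from the rest of
// the cluster and then heals the network. It assumes a started Cluster with
// at least one dn service configured.
func exampleNetworkPartition(c Cluster) {
	part := c.NewNetworkPartition([]uint32{0}, nil, nil)
	rest := c.RemainingNetworkPartition(part)

	// From here on, traffic is allowed only within each partition.
	c.StartNetworkPartition(part, rest)
	// ... assert that cross-partition traffic is rejected ...
	c.CloseNetworkPartition()
}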

// ClusterAwareness provides cluster awareness information.
type ClusterAwareness interface {
	// ListDNServices lists uuid of all dn services.
	ListDNServices() []string
	// ListLogServices lists uuid of all log services.
	ListLogServices() []string
	// ListCnServices lists uuid of all cn services.
	ListCnServices() []string
	// ListHAKeeperServices lists all hakeeper log services.
	ListHAKeeperServices() []LogService

	// GetDNService fetches dn service instance by uuid.
	GetDNService(uuid string) (DNService, error)
	// GetLogService fetches log service instance by uuid.
	GetLogService(uuid string) (LogService, error)
	// GetDNServiceIndexed fetches dn service instance by index.
	GetDNServiceIndexed(index int) (DNService, error)
	// GetLogServiceIndexed fetches log service instance by index.
	GetLogServiceIndexed(index int) (LogService, error)
	// GetCNService fetches cn service instance by uuid.
	GetCNService(uuid string) (CNService, error)
	// GetCNServiceIndexed fetches cn service instance by index.
	GetCNServiceIndexed(index int) (CNService, error)

	// GetClusterState fetches the current cluster state.
	GetClusterState(ctx context.Context) (*logpb.CheckerState, error)
}

// ClusterState provides cluster running state.
type ClusterState interface {
	// ListDNShards lists all dn shards within the cluster.
	ListDNShards(ctx context.Context) ([]metadata.DNShardRecord, error)
	// ListLogShards lists all log shards within the cluster.
	ListLogShards(ctx context.Context) ([]metadata.LogShardRecord, error)

	// GetDNStoreInfo gets dn store information by uuid.
	GetDNStoreInfo(ctx context.Context, uuid string) (logpb.DNStoreInfo, error)
	// GetDNStoreInfoIndexed gets dn store information by index.
	GetDNStoreInfoIndexed(ctx context.Context, index int) (logpb.DNStoreInfo, error)

	// GetLogStoreInfo gets log store information by uuid.
	GetLogStoreInfo(ctx context.Context, uuid string) (logpb.LogStoreInfo, error)
	// GetLogStoreInfoIndexed gets log store information by index.
	GetLogStoreInfoIndexed(ctx context.Context, index int) (logpb.LogStoreInfo, error)

	// GetCNStoreInfo gets cn store information by uuid.
	GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error)
	// GetCNStoreInfoIndexed gets cn store information by index.
	GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error)

	// GetHAKeeperState returns hakeeper state from the running hakeeper.
	GetHAKeeperState() logpb.HAKeeperState
	// GetHAKeeperConfig returns hakeeper configuration.
	GetHAKeeperConfig() hakeeper.Config

	// DNStoreExpired checks whether dn store is expired by uuid.
	DNStoreExpired(uuid string) (bool, error)
	// DNStoreExpiredIndexed checks whether dn store is expired by index.
	DNStoreExpiredIndexed(index int) (bool, error)
	// LogStoreExpired checks whether log store is expired by uuid.
	LogStoreExpired(uuid string) (bool, error)
	// LogStoreExpiredIndexed checks whether log store is expired by index.
	LogStoreExpiredIndexed(index int) (bool, error)
	// CNStoreExpired checks whether cn store is expired by uuid.
	CNStoreExpired(uuid string) (bool, error)
	// CNStoreExpiredIndexed checks whether cn store is expired by index.
	CNStoreExpiredIndexed(index int) (bool, error)

	// IsClusterHealthy checks whether the cluster is healthy or not.
	IsClusterHealthy() bool
}

// ClusterWaitState waits for cluster state until timeout.
type ClusterWaitState interface {
	// WaitHAKeeperLeader waits until the hakeeper leader is elected and returns it.
	WaitHAKeeperLeader(ctx context.Context) LogService
	// WaitHAKeeperState waits for the specific hakeeper state.
	WaitHAKeeperState(ctx context.Context, expected logpb.HAKeeperState)

	// WaitDNShardsReported waits until the expected count of dn shards is reported.
	WaitDNShardsReported(ctx context.Context)
	// WaitLogShardsReported waits until the expected count of log shards is reported.
	WaitLogShardsReported(ctx context.Context)
	// WaitDNReplicaReported waits until a dn replica is reported.
	WaitDNReplicaReported(ctx context.Context, shardID uint64)
	// WaitLogReplicaReported waits until log replicas are reported.
	WaitLogReplicaReported(ctx context.Context, shardID uint64)

	// WaitDNStoreTimeout waits until dn store times out by uuid.
	WaitDNStoreTimeout(ctx context.Context, uuid string)
	// WaitDNStoreTimeoutIndexed waits until dn store times out by index.
	WaitDNStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitDNStoreReported waits until dn store is reported by uuid.
	WaitDNStoreReported(ctx context.Context, uuid string)
	// WaitDNStoreReportedIndexed waits until dn store is reported by index.
	WaitDNStoreReportedIndexed(ctx context.Context, index int)
	// WaitDNStoreTaskServiceCreated waits until dn store task service is started by uuid.
	WaitDNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitDNStoreTaskServiceCreatedIndexed waits until dn store task service is started by index.
	WaitDNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitCNStoreReported waits until cn store is reported by uuid.
	WaitCNStoreReported(ctx context.Context, uuid string)
	// WaitCNStoreReportedIndexed waits until cn store is reported by index.
	WaitCNStoreReportedIndexed(ctx context.Context, index int)
	// WaitCNStoreTaskServiceCreated waits until cn store task service is started by uuid.
	WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitCNStoreTaskServiceCreatedIndexed waits until cn store task service is started by index.
	WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitLogStoreTaskServiceCreated waits until log store task service is started by uuid.
	WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitLogStoreTaskServiceCreatedIndexed waits until log store task service is started by index.
	WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int)

	// WaitLogStoreTimeout waits until log store times out by uuid.
	WaitLogStoreTimeout(ctx context.Context, uuid string)
	// WaitLogStoreTimeoutIndexed waits until log store times out by index.
	WaitLogStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitLogStoreReported waits until log store is reported by uuid.
	WaitLogStoreReported(ctx context.Context, uuid string)
	// WaitLogStoreReportedIndexed waits until log store is reported by index.
	WaitLogStoreReportedIndexed(ctx context.Context, index int)
}
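
// exampleFailureInjection is a hedged sketch, not part of the framework: the
// wait helpers above pair with ClusterOperation to simulate a dn store crash
// and recovery. It assumes a started Cluster, a ctx with a deadline, and at
// least one dn service (index 0).
func exampleFailureInjection(ctx context.Context, t *testing.T, c Cluster) {
	require.NoError(t, c.CloseDNServiceIndexed(0))
	c.WaitDNStoreTimeoutIndexed(ctx, 0) // heartbeats stop; the store expires

	require.NoError(t, c.StartDNServiceIndexed(0))
	c.WaitDNStoreReportedIndexed(ctx, 0) // heartbeats resume; the store reports again
}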

// ----------------------------------------------------
// The following implement the interface `Cluster`.
// ----------------------------------------------------

// testCluster simulates a cluster with log, dn and cn services.
type testCluster struct {
	t       *testing.T
	testID  string
	opt     Options
	logger  *zap.Logger
	stopper *stopper.Stopper
	clock   clock.Clock

	dn struct {
		sync.Mutex
		cfgs []*dnservice.Config
		opts []dnOptions
		svcs []DNService
	}

	log struct {
		once sync.Once

		sync.Mutex
		cfgs []logservice.Config
		opts []logOptions
		svcs []LogService
	}

	cn struct {
		sync.Mutex
		cfgs []*cnservice.Config
		opts []cnOptions
		svcs []CNService
	}

	network struct {
		addresses serviceAddresses

		sync.RWMutex
		addressSets []addressSet
	}

	fileservices *fileServices

	mu struct {
		sync.Mutex
		running bool
	}
}

// NewCluster constructs a cluster for integration tests.
func NewCluster(t *testing.T, opt Options) (Cluster, error) {
	logutil.SetupMOLogger(&logutil.LogConfig{
		Level:  "debug",
		Format: "console",
	})
	opt.validate()

	c := &testCluster{
		t:       t,
		testID:  uuid.New().String(),
		opt:     opt,
		stopper: stopper.NewStopper("test-cluster"),
	}
	c.logger = logutil.Adjust(opt.logger).With(zap.String("testcase", t.Name())).With(zap.String("test-id", c.testID))
	c.opt.rootDataDir = filepath.Join(c.opt.rootDataDir, c.testID, t.Name())
	if c.clock == nil {
		c.clock = clock.NewUnixNanoHLCClockWithStopper(c.stopper, 0)
	}

	// TODO: CN and LOG use process level runtime
	runtime.SetupProcessLevelRuntime(c.newRuntime())

	// build addresses for all services
	c.network.addresses = c.buildServiceAddresses()
	// build log service configurations
	c.log.cfgs, c.log.opts = c.buildLogConfigs(c.network.addresses)
	// build dn service configurations
	c.dn.cfgs, c.dn.opts = c.buildDNConfigs(c.network.addresses)
	// build cn service configurations
	c.cn.cfgs, c.cn.opts = c.buildCNConfigs(c.network.addresses)
	// build FileService instances
	c.fileservices = c.buildFileServices()

	return c, nil
}

func (c *testCluster) Start() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.mu.running {
		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	c.mu.running = true
	// start log services first
	if err := c.startLogServices(ctx); err != nil {
		return err
	}

	// start dn services
	if err := c.startDNServices(ctx); err != nil {
		return err
	}

	// start cn services
	if err := c.startCNServices(ctx); err != nil {
		return err
	}

	return nil
}

func (c *testCluster) Options() Options {
	return c.opt
}

func (c *testCluster) Close() error {
	defer logutil.LogClose(c.logger, "tests-framework")()
	c.logger.Info("closing testCluster")

	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.mu.running {
		return nil
	}

	// close all cn services first
	if err := c.closeCNServices(); err != nil {
		return err
	}
	// close all dn services
	if err := c.closeDNServices(); err != nil {
		return err
	}

	// close all log services
	if err := c.closeLogServices(); err != nil {
		return err
	}

	c.mu.running = false
	c.stopper.Stop()

	if !c.opt.keepData {
		if err := os.RemoveAll(c.opt.rootDataDir); err != nil {
			return err
		}
	}
	return nil
}

// ----------------------------------------------------------
// The following implement the interface `ClusterState`.
// ----------------------------------------------------------
func (c *testCluster) ListDNShards(
	ctx context.Context,
) ([]metadata.DNShardRecord, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return nil, err
	}
	return state.ClusterInfo.DNShards, nil
}

func (c *testCluster) ListLogShards(
	ctx context.Context,
) ([]metadata.LogShardRecord, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return nil, err
	}
	return state.ClusterInfo.LogShards, nil
}

func (c *testCluster) GetDNStoreInfo(
	ctx context.Context, uuid string,
) (logpb.DNStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.DNStoreInfo{}, err
	}
	stores := state.DNState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.DNStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetDNStoreInfoIndexed(
	ctx context.Context, index int,
) (logpb.DNStoreInfo, error) {
	ds, err := c.GetDNServiceIndexed(index)
	if err != nil {
		return logpb.DNStoreInfo{}, err
	}
	return c.GetDNStoreInfo(ctx, ds.ID())
}

func (c *testCluster) GetLogStoreInfo(
	ctx context.Context, uuid string,
) (logpb.LogStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.LogStoreInfo{}, err
	}
	stores := state.LogState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.LogStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetLogStoreInfoIndexed(
	ctx context.Context, index int,
) (logpb.LogStoreInfo, error) {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return logpb.LogStoreInfo{}, err
	}
	return c.GetLogStoreInfo(ctx, ls.ID())
}

func (c *testCluster) GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error) {
	state, err := c.GetClusterState(ctx)
	if err != nil {
		return logpb.CNStoreInfo{}, err
	}
	stores := state.CNState.Stores
	if storeInfo, ok := stores[uuid]; ok {
		return storeInfo, nil
	}
	return logpb.CNStoreInfo{}, moerr.NewNoService(ctx, uuid)
}

func (c *testCluster) GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error) {
	ls, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return logpb.CNStoreInfo{}, err
	}
	return c.GetCNStoreInfo(ctx, ls.ID())
}

func (c *testCluster) GetHAKeeperState() logpb.HAKeeperState {
	state := c.getClusterState()
	require.NotNil(c.t, state)
	return state.State
}

func (c *testCluster) GetHAKeeperConfig() hakeeper.Config {
	return c.opt.BuildHAKeeperConfig()
}

func (c *testCluster) DNStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	dnStore, ok := state.DNState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.DNStoreExpired(dnStore.Tick, state.Tick)

	c.logger.Info(
		"check dn store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("dn store tick", dnStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) DNStoreExpiredIndexed(index int) (bool, error) {
	ds, err := c.GetDNServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.DNStoreExpired(ds.ID())
}

func (c *testCluster) LogStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	logStore, ok := state.LogState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.LogStoreExpired(logStore.Tick, state.Tick)

	c.logger.Info(
		"check log store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("log store tick", logStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) LogStoreExpiredIndexed(index int) (bool, error) {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.LogStoreExpired(ls.ID())
}

func (c *testCluster) CNStoreExpired(uuid string) (bool, error) {
	state := c.getClusterState()
	require.NotNil(c.t, state)

	cnStore, ok := state.CNState.Stores[uuid]
	if !ok {
		return false, moerr.NewShardNotReportedNoCtx(uuid, 0)
	}

	hkcfg := c.GetHAKeeperConfig()
	expired := hkcfg.CNStoreExpired(cnStore.Tick, state.Tick)

	c.logger.Info(
		"check cn store expired or not",
		zap.Any("hakeeper config", hkcfg),
		zap.Uint64("cn store tick", cnStore.Tick),
		zap.Uint64("current tick", state.Tick),
		zap.Bool("expired", expired),
	)

	return expired, nil
}

func (c *testCluster) CNStoreExpiredIndexed(index int) (bool, error) {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return false, err
	}
	return c.CNStoreExpired(cs.ID())
}

func (c *testCluster) IsClusterHealthy() bool {
	hkcfg := c.GetHAKeeperConfig()
	state := c.getClusterState()
	_, healthy := syshealth.Check(
		hkcfg,
		state.GetClusterInfo(),
		state.GetDNState(),
		state.GetLogState(),
		state.GetTick(),
	)
	return healthy
}
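
// exampleHealthProbe is a hedged sketch, not part of the framework. The
// expiration checks above all share one shape: look up the store's
// last-heartbeat tick in hakeeper's checker state and compare it with the
// current tick under the hakeeper config. It assumes dn service index 0 exists.
func exampleHealthProbe(t *testing.T, c Cluster) {
	expired, err := c.DNStoreExpiredIndexed(0)
	require.NoError(t, err)
	if !expired && c.IsClusterHealthy() {
		t.Log("dn store 0 is still heartbeating and the cluster is healthy")
	}
}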

// --------------------------------------------------------------
// The following implement the interface `ClusterWaitState`.
// --------------------------------------------------------------
func (c *testCluster) WaitHAKeeperLeader(ctx context.Context) LogService {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for hakeeper leader",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			leader := c.getHAKeeperLeader()
			if leader != nil {
				return leader
			}
		}
	}
}

func (c *testCluster) WaitHAKeeperState(
	ctx context.Context, expected logpb.HAKeeperState,
) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for hakeeper state",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}
			if state.State == expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitDNShardsReported(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for all dn shards reported",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseExpectedDNShardCount(state.ClusterInfo)
			reported := ParseReportedDNShardCount(
				state.DNState, c.GetHAKeeperConfig(), state.Tick,
			)

			// FIXME: what about reported larger than expected
			if reported >= expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogShardsReported(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting for all log shards reported",
				"error: %s", ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseExpectedLogShardCount(state.ClusterInfo)
			reported := ParseReportedLogShardCount(
				state.LogState, c.GetHAKeeperConfig(), state.Tick,
			)
			// FIXME: what about reported larger than expected
			if reported >= expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitDNReplicaReported(ctx context.Context, shardID uint64) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting replica of dn shard reported",
				"shard %d, error: %s", shardID, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			reported := ParseDNShardReportedSize(
				shardID, state.DNState, c.GetHAKeeperConfig(), state.Tick,
			)
			if reported >= DNShardExpectedSize {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogReplicaReported(ctx context.Context, shardID uint64) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting replica of log shard reported",
				"shard %d, error: %s", shardID, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			state := c.getClusterState()
			if state == nil {
				continue
			}

			expected := ParseLogShardExpectedSize(shardID, state.ClusterInfo)
			reported := ParseLogShardReportedSize(
				shardID, state.LogState, c.GetHAKeeperConfig(), state.Tick,
			)
			if reported >= expected {
				return
			}
		}
	}
}

func (c *testCluster) WaitDNStoreTimeout(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting dn store timeout",
				"dn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.DNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check dn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitDNStoreTimeoutIndexed(ctx context.Context, index int) {
	ds, err := c.GetDNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitDNStoreTimeout(ctx, ds.ID())
}

func (c *testCluster) WaitDNStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting dn store reported",
				"dn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.DNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check dn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitDNStoreReportedIndexed(ctx context.Context, index int) {
	ds, err := c.GetDNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitDNStoreReported(ctx, ds.ID())
}

func (c *testCluster) WaitCNStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting cn store reported",
				"cn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.CNStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check cn store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitCNStoreReportedIndexed(ctx context.Context, index int) {
	ds, err := c.GetCNServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitCNStoreReported(ctx, ds.ID())
}

func (c *testCluster) WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string) {
	ds, err := c.GetCNService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on cn store",
				"cn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := ds.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}

func (c *testCluster) WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	ds, err := c.GetCNServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitCNStoreTaskServiceCreated(ctx, ds.ID())
}

func (c *testCluster) WaitDNStoreTaskServiceCreated(ctx context.Context, uuid string) {
	ds, err := c.GetDNService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on dn store",
				"dn store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := ds.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}

func (c *testCluster) WaitDNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	ds, err := c.GetDNServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitDNStoreTaskServiceCreated(ctx, ds.ID())
}

func (c *testCluster) WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string) {
	ls, err := c.GetLogService(uuid)
	require.NoError(c.t, err)

	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting task service created on log store",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			_, ok := ls.GetTaskService()
			if ok {
				return
			}
			time.Sleep(defaultWaitInterval)
		}
	}
}

func (c *testCluster) WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
	ds, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)
	c.WaitLogStoreTaskServiceCreated(ctx, ds.ID())
}

func (c *testCluster) WaitLogStoreTimeout(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting log store timeout",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.LogStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check log store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogStoreTimeoutIndexed(ctx context.Context, index int) {
	ls, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitLogStoreTimeout(ctx, ls.ID())
}

func (c *testCluster) WaitLogStoreReported(ctx context.Context, uuid string) {
	for {
		select {
		case <-ctx.Done():
			assert.FailNow(
				c.t,
				"terminated when waiting log store reported",
				"log store %s, error: %s", uuid, ctx.Err(),
			)
		default:
			time.Sleep(defaultWaitInterval)

			expired, err := c.LogStoreExpired(uuid)
			if err != nil {
				c.logger.Error("fail to check log store expired or not",
					zap.Error(err),
					zap.String("uuid", uuid),
				)
				continue
			}

			if !expired {
				return
			}
		}
	}
}

func (c *testCluster) WaitLogStoreReportedIndexed(ctx context.Context, index int) {
	ls, err := c.GetLogServiceIndexed(index)
	require.NoError(c.t, err)

	c.WaitLogStoreReported(ctx, ls.ID())
}

// --------------------------------------------------------------
// The following implement the interface `ClusterAwareness`.
// --------------------------------------------------------------
func (c *testCluster) ListDNServices() []string {
	ids := make([]string, 0, len(c.dn.svcs))
	for _, cfg := range c.dn.cfgs {
		ids = append(ids, cfg.UUID)
	}
	return ids
}

func (c *testCluster) ListLogServices() []string {
	ids := make([]string, 0, len(c.log.svcs))
	for _, svc := range c.log.svcs {
		ids = append(ids, svc.ID())
	}
	return ids
}

func (c *testCluster) ListCnServices() []string {
	ids := make([]string, 0, len(c.cn.svcs))
	for _, svc := range c.cn.svcs {
		ids = append(ids, svc.ID())
	}
	return ids
}

func (c *testCluster) ListHAKeeperServices() []LogService {
	return c.selectHAkeeperServices()
}

func (c *testCluster) GetDNService(uuid string) (DNService, error) {
	c.dn.Lock()
	defer c.dn.Unlock()

	for i, cfg := range c.dn.cfgs {
		if cfg.UUID == uuid {
			return c.dn.svcs[i], nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetLogService(uuid string) (LogService, error) {
	c.log.Lock()
	defer c.log.Unlock()

	for _, svc := range c.log.svcs {
		if svc.ID() == uuid {
			return svc, nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetCNService(uuid string) (CNService, error) {
	c.cn.Lock()
	defer c.cn.Unlock()

	for _, svc := range c.cn.svcs {
		if svc.ID() == uuid {
			return svc, nil
		}
	}
	return nil, moerr.NewNoServiceNoCtx(uuid)
}

func (c *testCluster) GetDNServiceIndexed(index int) (DNService, error) {
	c.dn.Lock()
	defer c.dn.Unlock()

	if index >= len(c.dn.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.dn.svcs[index], nil
}

func (c *testCluster) GetLogServiceIndexed(index int) (LogService, error) {
	c.log.Lock()
	defer c.log.Unlock()

	if index >= len(c.log.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.log.svcs[index], nil
}

func (c *testCluster) GetCNServiceIndexed(index int) (CNService, error) {
	c.cn.Lock()
	defer c.cn.Unlock()

	if index >= len(c.cn.svcs) || index < 0 {
		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
	}
	return c.cn.svcs[index], nil
}

// NB: we could also fetch cluster state from a non-leader hakeeper.
func (c *testCluster) GetClusterState(
	ctx context.Context,
) (*logpb.CheckerState, error) {
	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
	leader := c.WaitHAKeeperLeader(ctx)
	return leader.GetClusterState()
}
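
// exampleClusterTopology is a hedged sketch, not part of the framework:
// GetClusterState blocks until hakeeper is running and a leader is elected,
// so the ctx should carry a deadline.
func exampleClusterTopology(ctx context.Context, t *testing.T, c Cluster) {
	state, err := c.GetClusterState(ctx)
	require.NoError(t, err)

	shards, err := c.ListDNShards(ctx)
	require.NoError(t, err)
	t.Logf("dn shards: %d, dn stores: %d", len(shards), len(state.DNState.Stores))
}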

// --------------------------------------------------------------
// The following implement the interface `ClusterOperation`.
// --------------------------------------------------------------
func (c *testCluster) CloseDNService(uuid string) error {
	ds, err := c.GetDNService(uuid)
	if err != nil {
		return err
	}
	return ds.Close()
}

func (c *testCluster) StartDNService(uuid string) error {
	ds, err := c.GetDNService(uuid)
	if err != nil {
		return err
	}
	return ds.Start()
}

func (c *testCluster) CloseDNServiceIndexed(index int) error {
	ds, err := c.GetDNServiceIndexed(index)
	if err != nil {
		return err
	}
	return ds.Close()
}

func (c *testCluster) StartDNServiceIndexed(index int) error {
	ds, err := c.GetDNServiceIndexed(index)
	if err != nil {
		return err
	}
	return ds.Start()
}

func (c *testCluster) CloseLogService(uuid string) error {
	ls, err := c.GetLogService(uuid)
	if err != nil {
		return err
	}
	return ls.Close()
}

func (c *testCluster) StartLogService(uuid string) error {
	ls, err := c.GetLogService(uuid)
	if err != nil {
		return err
	}
	return ls.Start()
}

func (c *testCluster) CloseLogServiceIndexed(index int) error {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return err
	}
	return ls.Close()
}

func (c *testCluster) StartLogServiceIndexed(index int) error {
	ls, err := c.GetLogServiceIndexed(index)
	if err != nil {
		return err
	}
	return ls.Start()
}

func (c *testCluster) CloseCNService(uuid string) error {
	cs, err := c.GetCNService(uuid)
	if err != nil {
		return err
	}
	return cs.Close()
}

func (c *testCluster) StartCNService(uuid string) error {
	cs, err := c.GetCNService(uuid)
	if err != nil {
		return err
	}
	return cs.Start()
}

func (c *testCluster) CloseCNServiceIndexed(index int) error {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return err
	}
	return cs.Close()
}

func (c *testCluster) StartCNServiceIndexed(index int) error {
	cs, err := c.GetCNServiceIndexed(index)
	if err != nil {
		return err
	}
	return cs.Start()
}

func (c *testCluster) NewNetworkPartition(
	dnIndexes, logIndexes, cnIndexes []uint32,
) NetworkPartition {
	return newNetworkPartition(
		c.opt.initial.logServiceNum, logIndexes,
		c.opt.initial.dnServiceNum, dnIndexes,
		c.opt.initial.cnServiceNum, cnIndexes,
	)
}

func (c *testCluster) RemainingNetworkPartition(
	partitions ...NetworkPartition,
) NetworkPartition {
	return remainingNetworkPartition(c.opt.initial.logServiceNum, c.opt.initial.dnServiceNum, 0, partitions...)
}

func (c *testCluster) StartNetworkPartition(parts ...NetworkPartition) {
	c.network.Lock()
	defer c.network.Unlock()

	addressSets := c.network.addresses.buildPartitionAddressSets(parts...)
	c.network.addressSets = addressSets
}

func (c *testCluster) CloseNetworkPartition() {
	c.network.Lock()
	defer c.network.Unlock()

	c.network.addressSets = nil
}

// ------------------------------------------------------
// The following are private utilities for `testCluster`.
// ------------------------------------------------------

// buildServiceAddresses builds addresses for all services.
func (c *testCluster) buildServiceAddresses() serviceAddresses {
	return newServiceAddresses(c.t, c.opt.initial.logServiceNum,
		c.opt.initial.dnServiceNum, c.opt.initial.cnServiceNum, c.opt.hostAddr)
}

// buildDNConfigs builds configurations for all dn services.
func (c *testCluster) buildDNConfigs(
	address serviceAddresses,
) ([]*dnservice.Config, []dnOptions) {
	batch := c.opt.initial.dnServiceNum

	cfgs := make([]*dnservice.Config, 0, batch)
	opts := make([]dnOptions, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := buildDNConfig(i, c.opt, address)
		cfgs = append(cfgs, cfg)

		localAddr := cfg.ListenAddress
		opt := buildDNOptions(cfg, c.backendFilterFactory(localAddr))
		opts = append(opts, opt)
	}
	return cfgs, opts
}

// buildLogConfigs builds configurations for all log services.
func (c *testCluster) buildLogConfigs(
	address serviceAddresses,
) ([]logservice.Config, []logOptions) {
	batch := c.opt.initial.logServiceNum

	cfgs := make([]logservice.Config, 0, batch)
	opts := make([]logOptions, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := buildLogConfig(i, c.opt, address)
		cfgs = append(cfgs, cfg)

		localAddr := cfg.ServiceAddress
		opt := buildLogOptions(cfg, c.backendFilterFactory(localAddr))
		opts = append(opts, opt)
	}
	return cfgs, opts
}

func (c *testCluster) buildCNConfigs(
	address serviceAddresses,
) ([]*cnservice.Config, []cnOptions) {
	batch := c.opt.initial.cnServiceNum

	cfgs := make([]*cnservice.Config, 0, batch)
	opts := make([]cnOptions, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := buildCNConfig(i, c.opt, address)
		cfgs = append(cfgs, cfg)

		opt := buildCNOptions()
		opt = append(opt, cnservice.WithLogger(c.logger))
		opts = append(opts, opt)
	}
	return cfgs, opts
}

// initDNServices builds all dn services.
//
// Log services must already be started before dn services are initialized.
func (c *testCluster) initDNServices(fileservices *fileServices) []DNService {
	batch := c.opt.initial.dnServiceNum

	c.logger.Info("initialize dn services", zap.Int("batch", batch))

	svcs := make([]DNService, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := c.dn.cfgs[i]
		opt := c.dn.opts[i]
		fs, err := fileservice.NewFileServices(
			defines.LocalFileServiceName,
			fileservices.getDNLocalFileService(i),
			fileservices.getS3FileService(),
		)
		if err != nil {
			panic(err)
		}
		ds, err := newDNService(
			cfg,
			c.newRuntime(),
			fs,
			opt)
		require.NoError(c.t, err)

		c.logger.Info(
			"dn service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		svcs = append(svcs, ds)
	}

	return svcs
}

// initLogServices builds all log services.
func (c *testCluster) initLogServices() []LogService {
	batch := c.opt.initial.logServiceNum

	c.logger.Info("initialize log services", zap.Int("batch", batch))

	svcs := make([]LogService, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := c.log.cfgs[i]
		opt := c.log.opts[i]
		ls, err := newLogService(cfg, testutil.NewFS(), opt)
		require.NoError(c.t, err)

		c.logger.Info(
			"log service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		svcs = append(svcs, ls)
	}
	return svcs
}

func (c *testCluster) initCNServices(fileservices *fileServices) []CNService {
	batch := c.opt.initial.cnServiceNum

	c.logger.Info("initialize cn services", zap.Int("batch", batch))

	svcs := make([]CNService, 0, batch)
	for i := 0; i < batch; i++ {
		cfg := c.cn.cfgs[i]
		opt := c.cn.opts[i]
		fs, err := fileservice.NewFileServices(
			defines.LocalFileServiceName,
			fileservices.getCNLocalFileService(i),
			fileservices.getS3FileService(),
		)
		if err != nil {
			panic(err)
		}
		ctx, cancel := context.WithCancel(context.Background())
		cs, err := newCNService(cfg, ctx, fs, opt)
		if err != nil {
			panic(err)
		}
		cs.SetCancel(cancel)

		c.logger.Info(
			"cn service initialized",
			zap.Int("index", i),
			zap.Any("config", cfg),
		)

		svcs = append(svcs, cs)
	}
	return svcs
}

// startDNServices initializes and starts all dn services.
func (c *testCluster) startDNServices(ctx context.Context) error {
	// initialize all dn services
	c.dn.svcs = c.initDNServices(c.fileservices)

	// start dn services
	for _, ds := range c.dn.svcs {
		if err := ds.Start(); err != nil {
			return err
		}
	}

	c.WaitDNShardsReported(ctx)
	return nil
}

// startLogServices initializes and starts all log services.
func (c *testCluster) startLogServices(ctx context.Context) error {
	// initialize all log services
	c.log.svcs = c.initLogServices()

	// start log services
	for _, ls := range c.log.svcs {
		if err := ls.Start(); err != nil {
			return err
		}
	}

	// start hakeeper replicas
	if err := c.startHAKeeperReplica(); err != nil {
		return err
	}

	// initialize cluster information
	if err := c.setInitialClusterInfo(); err != nil {
		return err
	}

	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
	return nil
}

func (c *testCluster) startCNServices(ctx context.Context) error {
	c.cn.svcs = c.initCNServices(c.fileservices)

	for _, cs := range c.cn.svcs {
		if err := cs.Start(); err != nil {
			return err
		}
	}

	if err := c.waitSystemInitCompleted(ctx); err != nil {
		return err
	}
	return nil
}

// closeDNServices closes all dn services.
func (c *testCluster) closeDNServices() error {
	c.logger.Info("start to close dn services")

	for i, ds := range c.dn.svcs {
		c.logger.Info("close dn service", zap.Int("index", i))
		if err := ds.Close(); err != nil {
			return err
		}
		c.logger.Info("dn service closed", zap.Int("index", i))
	}

	return nil
}

// closeLogServices closes all log services.
func (c *testCluster) closeLogServices() error {
	defer logutil.LogClose(c.logger, "tests-framework/logservices")()

	for i, ls := range c.log.svcs {
		c.logger.Info("close log service", zap.Int("index", i))
		if err := ls.Close(); err != nil {
			return err
		}
		c.logger.Info("log service closed", zap.Int("index", i))
	}

	return nil
}

func (c *testCluster) closeCNServices() error {
	defer logutil.LogClose(c.logger, "tests-framework/cnservices")()

	for i, cs := range c.cn.svcs {
		c.logger.Info("close cn service", zap.Int("index", i))
		if err := cs.Close(); err != nil {
			return err
		}
		c.logger.Info("cn service closed", zap.Int("index", i))
	}

	return nil
}

// getClusterState fetches cluster state from an arbitrary hakeeper.
//
// NB: it's possible that getClusterState returns a nil value.
func (c *testCluster) getClusterState() *logpb.CheckerState {
	var state *logpb.CheckerState
	fn := func(index int, svc LogService) bool {
		s, err := svc.GetClusterState()
		if err != nil {
			c.logger.Error(
				"fail to get cluster state",
				zap.Error(err),
				zap.Int("index", index),
			)
			return false
		}
		state = s
		// XXX MPOOL
		// Too much logging can break CI.
		// c.logger.Info("current cluster state", zap.Any("state", s))
		return true
	}
	c.rangeHAKeeperService(fn)
	return state
}

// getHAKeeperLeader gets the log service that is the hakeeper leader.
func (c *testCluster) getHAKeeperLeader() LogService {
	var leader LogService
	fn := func(index int, svc LogService) bool {
		isLeader, err := svc.IsLeaderHakeeper()
		if err != nil {
			c.logger.Error(
				"fail to check hakeeper",
				zap.Error(err),
				zap.Int("index", index),
			)
			return false
		}
		c.logger.Info(
			"hakeeper state",
			zap.Bool("isLeader", isLeader),
			zap.Int("index", index),
		)

		if isLeader {
			leader = svc
			return true
		}

		return false
	}
	c.rangeHAKeeperService(fn)
	return leader
}

// rangeHAKeeperService iterates over all hakeeper services until `fn` returns true.
func (c *testCluster) rangeHAKeeperService(
	fn func(index int, svc LogService) bool,
) {
	for i, svc := range c.selectHAkeeperServices() {
		index := i

		if svc.Status() != ServiceStarted {
			c.logger.Warn(
				"hakeeper service not started",
				zap.Int("index", index),
			)
			continue
		}

		if fn(index, svc) {
			break
		}
	}
}

func (c *testCluster) waitSystemInitCompleted(ctx context.Context) error {
	log, err := c.GetLogServiceIndexed(0)
	if err != nil {
		return err
	}
	if err := log.CreateInitTasks(); err != nil {
		return err
	}

	c.WaitCNStoreTaskServiceCreatedIndexed(ctx, 0)
	cn, err := c.GetCNServiceIndexed(0)
	if err != nil {
		return err
	}
	if err := cn.WaitSystemInitCompleted(ctx); err != nil {
		return err
	}
	return nil
}

func (c *testCluster) newRuntime() runtime.Runtime {
	return runtime.NewRuntime(metadata.ServiceType_CN, "", c.logger, runtime.WithClock(c.clock))
}

// FilterFunc returns true if the traffic is allowed.
type FilterFunc func(morpc.Message, string) bool

// backendFilterFactory constructs a closure of type FilterFunc.
func (c *testCluster) backendFilterFactory(localAddr string) FilterFunc {
	return func(_ morpc.Message, backendAddr string) bool {
		// NB: it's possible that a partition takes effect again after having
		// been disabled.
		c.network.RLock()
		addressSets := c.network.addressSets
		c.network.RUnlock()

		if len(addressSets) == 0 {
			return true
		}

		for _, addrSet := range addressSets {
			if addrSet.contains(localAddr) &&
				addrSet.contains(backendAddr) {
				return true
			}
		}

		c.logger.Info(
			"traffic not allowed",
			zap.String("local", localAddr),
			zap.String("backend", backendAddr),
		)

		return false
	}
}
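
// exampleBackendFilter is a hedged sketch, not part of the framework: it
// illustrates the filter semantics above with made-up addresses. With no
// partition installed every backend passes; once StartNetworkPartition is
// called, a connection passes only if the local and backend addresses fall
// within the same partition's address set.
func exampleBackendFilter(c *testCluster) {
	filter := c.backendFilterFactory("127.0.0.1:2000")

	// The message argument is ignored by the filter, so nil is fine here.
	allowed := filter(nil, "127.0.0.1:3000")
	c.logger.Info("traffic check", zap.Bool("allowed", allowed))
}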