github.com/matrixorigin/matrixone@v0.7.0/pkg/hakeeper/checkers/syshealth/check.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package syshealth
    16  
    17  import (
    18  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    19  	"github.com/matrixorigin/matrixone/pkg/hakeeper/operator"
    20  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    21  )
    22  
    23  const (
    24  	defaultLogShardSize = 3
    25  )
    26  
    27  // Check checks system healthy or not.
    28  // If system wasn't healthy, we would generate
    29  // operators in order to shut down all stores.
    30  func Check(
    31  	cfg hakeeper.Config,
    32  	cluster pb.ClusterInfo,
    33  	dnState pb.DNState,
    34  	logState pb.LogState,
    35  	currTick uint64,
    36  ) ([]*operator.Operator, bool) {
    37  	sysHealthy := true
    38  
    39  	// parse all log stores for expired stores mainly
    40  	logStores := parseLogState(cfg, logState, currTick)
    41  	if len(logStores.expired) == 0 {
    42  		return nil, sysHealthy
    43  	}
    44  
    45  	// check system healthy or not
    46  	expiredShards := listExpiredShards(logStores.expired, logStores.working, logState, cluster)
    47  	for _, shard := range expiredShards {
    48  		// if one of log shards wasn't healthy, the entire system wasn't healthy.
    49  		if !shard.healthy() {
    50  			sysHealthy = false
    51  			break
    52  		}
    53  	}
    54  
    55  	if sysHealthy {
    56  		return nil, sysHealthy
    57  	}
    58  
    59  	// parse all dn stores
    60  	dnStores := parseDnState(cfg, dnState, currTick)
    61  
    62  	// generate operators to shut down all stores
    63  	operators := make([]*operator.Operator, 0, logStores.length()+dnStores.length())
    64  	operators = append(operators, logStores.shutdownExpiredStores()...)
    65  	operators = append(operators, logStores.shutdownWorkingStores()...)
    66  	operators = append(operators, dnStores.shutdownExpiredStores()...)
    67  	operators = append(operators, dnStores.shutdownWorkingStores()...)
    68  
    69  	return operators, sysHealthy
    70  }
    71  
    72  // logShardMap is just a syntax sugar.
    73  type logShardMap map[uint64]*logShard
    74  
    75  func newLogShardMap() logShardMap {
    76  	return make(map[uint64]*logShard)
    77  }
    78  
    79  // registerExpiredReplica registers replica as expired.
    80  func (m logShardMap) registerExpiredReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) {
    81  	replicaID := replica.ReplicaID
    82  	shardID := replica.ShardID
    83  
    84  	if _, ok := m[shardID]; !ok {
    85  		m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster))
    86  	}
    87  	m[shardID].registerExpiredReplica(replicaID)
    88  }
    89  
    90  // registerWorkingReplica registers replica as working.
    91  func (m logShardMap) registerWorkingReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) {
    92  	replicaID := replica.ReplicaID
    93  	shardID := replica.ShardID
    94  
    95  	if _, ok := m[shardID]; !ok {
    96  		m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster))
    97  	}
    98  	m[shardID].registerWorkingReplica(replicaID)
    99  }
   100  
   101  // listExpiredShards lists those shards which has expired replica.
   102  func listExpiredShards(
   103  	expiredStores map[string]struct{},
   104  	workingStores map[string]struct{},
   105  	logState pb.LogState,
   106  	cluster pb.ClusterInfo,
   107  ) logShardMap {
   108  	expired := newLogShardMap()
   109  
   110  	// register log shards on expired stores
   111  	for id := range expiredStores {
   112  		expiredReplicas := logState.Stores[id].Replicas
   113  		for _, replica := range expiredReplicas {
   114  			expired.registerExpiredReplica(replica, cluster)
   115  		}
   116  	}
   117  
   118  	// register working replica for
   119  	for id := range workingStores {
   120  		workingReplicas := logState.Stores[id].Replicas
   121  		for _, replica := range workingReplicas {
   122  			// only register working replica for expired shards
   123  			if _, ok := expired[replica.ShardID]; ok {
   124  				expired.registerWorkingReplica(replica, cluster)
   125  			}
   126  		}
   127  	}
   128  
   129  	return expired
   130  }
   131  
   132  // getLogShardSize gets raft group size for the specified shard.
   133  func getLogShardSize(shardID uint64, cluster pb.ClusterInfo) uint64 {
   134  	for _, shardMeta := range cluster.LogShards {
   135  		if shardMeta.ShardID == shardID {
   136  			return shardMeta.NumberOfReplicas
   137  		}
   138  	}
   139  	return defaultLogShardSize
   140  }
   141  
   142  // logShard records metadata for log shard.
   143  type logShard struct {
   144  	shardID          uint64
   145  	numberOfReplicas uint64
   146  	expiredReplicas  map[uint64]struct{}
   147  	workingReplicas  map[uint64]struct{}
   148  }
   149  
   150  func newLogShard(shardID uint64, numberOfReplicas uint64) *logShard {
   151  	return &logShard{
   152  		shardID:          shardID,
   153  		numberOfReplicas: numberOfReplicas,
   154  		expiredReplicas:  make(map[uint64]struct{}),
   155  		workingReplicas:  make(map[uint64]struct{}),
   156  	}
   157  }
   158  
   159  // registerExpiredReplica registers expired replica ID.
   160  func (s *logShard) registerExpiredReplica(replicaID uint64) {
   161  	s.expiredReplicas[replicaID] = struct{}{}
   162  }
   163  
   164  // registerWorkingReplica registers working replica ID.
   165  func (s *logShard) registerWorkingReplica(replicaID uint64) {
   166  	s.workingReplicas[replicaID] = struct{}{}
   167  }
   168  
   169  // healthy checks whether log shard working or not.
   170  func (s *logShard) healthy() bool {
   171  	if s.numberOfReplicas > 0 &&
   172  		len(s.workingReplicas)*2 > int(s.numberOfReplicas) {
   173  		return true
   174  	}
   175  	return false
   176  }
   177  
   178  // storeSet separates stores as expired and working.
   179  type storeSet struct {
   180  	serviceType pb.ServiceType
   181  	working     map[string]struct{}
   182  	expired     map[string]struct{}
   183  }
   184  
   185  func newStoreSet(serviceType pb.ServiceType) *storeSet {
   186  	return &storeSet{
   187  		serviceType: serviceType,
   188  		working:     make(map[string]struct{}),
   189  		expired:     make(map[string]struct{}),
   190  	}
   191  }
   192  
   193  // length returns number of all stores within this set.
   194  func (s *storeSet) length() int {
   195  	return len(s.working) + len(s.expired)
   196  }
   197  
   198  // shutdownExpiredStores
   199  func (s *storeSet) shutdownExpiredStores() []*operator.Operator {
   200  	return shutdownStores(s.serviceType, s.expired)
   201  }
   202  
   203  // shutdownWorkingStores generates operators to shut down working stores.
   204  func (s *storeSet) shutdownWorkingStores() []*operator.Operator {
   205  	return shutdownStores(s.serviceType, s.working)
   206  }
   207  
   208  // parseLogState separates log stores as expired and working.
   209  func parseLogState(cfg hakeeper.Config, logState pb.LogState, currTick uint64) *storeSet {
   210  	set := newStoreSet(pb.LogService)
   211  	for id, storeInfo := range logState.Stores {
   212  		if cfg.LogStoreExpired(storeInfo.Tick, currTick) {
   213  			set.expired[id] = struct{}{}
   214  		} else {
   215  			set.working[id] = struct{}{}
   216  		}
   217  	}
   218  	return set
   219  }
   220  
   221  // parseDnState separates dn stores as expired and working.
   222  func parseDnState(cfg hakeeper.Config, dnState pb.DNState, currTick uint64) *storeSet {
   223  	set := newStoreSet(pb.DNService)
   224  	for id, storeInfo := range dnState.Stores {
   225  		if cfg.DNStoreExpired(storeInfo.Tick, currTick) {
   226  			set.expired[id] = struct{}{}
   227  		} else {
   228  			set.working[id] = struct{}{}
   229  		}
   230  	}
   231  	return set
   232  }
   233  
   234  // shutdownStores generates operators to shut down stores.
   235  func shutdownStores(serviceType pb.ServiceType, stores map[string]struct{}) []*operator.Operator {
   236  	ops := make([]*operator.Operator, 0, len(stores))
   237  
   238  	switch serviceType {
   239  	case pb.LogService:
   240  		for id := range stores {
   241  			op := operator.NewOperator(
   242  				"logservice", operator.NoopShardID, operator.NoopEpoch,
   243  				operator.StopLogStore{StoreID: id},
   244  			)
   245  			ops = append(ops, op)
   246  		}
   247  	case pb.DNService:
   248  		for id := range stores {
   249  			op := operator.NewOperator(
   250  				"dnservice", operator.NoopShardID, operator.NoopEpoch,
   251  				operator.StopDnStore{StoreID: id},
   252  			)
   253  			ops = append(ops, op)
   254  		}
   255  	default:
   256  		panic("unexpected service type")
   257  	}
   258  
   259  	return ops
   260  }