github.com/matrixorigin/matrixone@v1.2.0/pkg/hakeeper/checkers/syshealth/check.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package syshealth
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    21  	"github.com/matrixorigin/matrixone/pkg/hakeeper/operator"
    22  	"github.com/matrixorigin/matrixone/pkg/logutil"
    23  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    24  )
    25  
    26  const (
    27  	defaultLogShardSize = 3
    28  )
    29  
    30  // Check checks system healthy or not.
    31  // If system wasn't healthy, we would generate
    32  // operators in order to shut down all stores.
    33  func Check(
    34  	cfg hakeeper.Config,
    35  	cluster pb.ClusterInfo,
    36  	tnState pb.TNState,
    37  	logState pb.LogState,
    38  	currTick uint64,
    39  ) ([]*operator.Operator, bool) {
    40  	sysHealthy := true
    41  
    42  	// parse all log stores for expired stores mainly
    43  	logStores := parseLogState(cfg, logState, currTick)
    44  	if len(logStores.expired) == 0 {
    45  		return nil, sysHealthy
    46  	}
    47  
    48  	// check system healthy or not
    49  	expiredShards := listExpiredShards(logStores.expired, logStores.working, logState, cluster)
    50  	for _, shard := range expiredShards {
    51  		// if one of log shards wasn't healthy, the entire system wasn't healthy.
    52  		if !shard.healthy() {
    53  			sysHealthy = false
    54  			break
    55  		}
    56  	}
    57  
    58  	if sysHealthy {
    59  		return nil, sysHealthy
    60  	}
    61  
    62  	detail := "Expired logStore info..."
    63  	for uuid := range logStores.expired {
    64  		detail += fmt.Sprintf("store %s replicas: [", uuid)
    65  		for _, replicaInfo := range logState.Stores[uuid].Replicas {
    66  			detail += fmt.Sprintf("%d-%d, ", replicaInfo.ShardID, replicaInfo.ReplicaID)
    67  		}
    68  		detail += "]; "
    69  	}
    70  
    71  	logutil.GetGlobalLogger().Info(detail)
    72  
    73  	// parse all tn stores
    74  	tnStores := parseTnState(cfg, tnState, currTick)
    75  
    76  	// generate operators to shut down all stores
    77  	operators := make([]*operator.Operator, 0, logStores.length()+tnStores.length())
    78  	operators = append(operators, logStores.shutdownExpiredStores()...)
    79  	operators = append(operators, logStores.shutdownWorkingStores()...)
    80  	operators = append(operators, tnStores.shutdownExpiredStores()...)
    81  	operators = append(operators, tnStores.shutdownWorkingStores()...)
    82  
    83  	return operators, sysHealthy
    84  }
    85  
    86  // logShardMap is just a syntax sugar.
    87  type logShardMap map[uint64]*logShard
    88  
    89  func newLogShardMap() logShardMap {
    90  	return make(map[uint64]*logShard)
    91  }
    92  
    93  // registerExpiredReplica registers replica as expired.
    94  func (m logShardMap) registerExpiredReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) {
    95  	replicaID := replica.ReplicaID
    96  	shardID := replica.ShardID
    97  
    98  	if _, ok := m[shardID]; !ok {
    99  		m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster))
   100  	}
   101  	m[shardID].registerExpiredReplica(replicaID)
   102  }
   103  
   104  // registerWorkingReplica registers replica as working.
   105  func (m logShardMap) registerWorkingReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) {
   106  	replicaID := replica.ReplicaID
   107  	shardID := replica.ShardID
   108  
   109  	if _, ok := m[shardID]; !ok {
   110  		m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster))
   111  	}
   112  	m[shardID].registerWorkingReplica(replicaID)
   113  }
   114  
   115  // listExpiredShards lists those shards which has expired replica.
   116  func listExpiredShards(
   117  	expiredStores map[string]struct{},
   118  	workingStores map[string]struct{},
   119  	logState pb.LogState,
   120  	cluster pb.ClusterInfo,
   121  ) logShardMap {
   122  	expired := newLogShardMap()
   123  
   124  	// register log shards on expired stores
   125  	for id := range expiredStores {
   126  		expiredReplicas := logState.Stores[id].Replicas
   127  		for _, replica := range expiredReplicas {
   128  			expired.registerExpiredReplica(replica, cluster)
   129  		}
   130  	}
   131  
   132  	// register working replica for
   133  	for id := range workingStores {
   134  		workingReplicas := logState.Stores[id].Replicas
   135  		for _, replica := range workingReplicas {
   136  			// only register working replica for expired shards
   137  			if _, ok := expired[replica.ShardID]; ok {
   138  				expired.registerWorkingReplica(replica, cluster)
   139  			}
   140  		}
   141  	}
   142  
   143  	return expired
   144  }
   145  
   146  // getLogShardSize gets raft group size for the specified shard.
   147  func getLogShardSize(shardID uint64, cluster pb.ClusterInfo) uint64 {
   148  	for _, shardMeta := range cluster.LogShards {
   149  		if shardMeta.ShardID == shardID {
   150  			return shardMeta.NumberOfReplicas
   151  		}
   152  	}
   153  	return defaultLogShardSize
   154  }
   155  
   156  // logShard records metadata for log shard.
   157  type logShard struct {
   158  	shardID          uint64
   159  	numberOfReplicas uint64
   160  	expiredReplicas  map[uint64]struct{}
   161  	workingReplicas  map[uint64]struct{}
   162  }
   163  
   164  func newLogShard(shardID uint64, numberOfReplicas uint64) *logShard {
   165  	return &logShard{
   166  		shardID:          shardID,
   167  		numberOfReplicas: numberOfReplicas,
   168  		expiredReplicas:  make(map[uint64]struct{}),
   169  		workingReplicas:  make(map[uint64]struct{}),
   170  	}
   171  }
   172  
   173  // registerExpiredReplica registers expired replica ID.
   174  func (s *logShard) registerExpiredReplica(replicaID uint64) {
   175  	s.expiredReplicas[replicaID] = struct{}{}
   176  }
   177  
   178  // registerWorkingReplica registers working replica ID.
   179  func (s *logShard) registerWorkingReplica(replicaID uint64) {
   180  	s.workingReplicas[replicaID] = struct{}{}
   181  }
   182  
   183  // healthy checks whether log shard working or not.
   184  func (s *logShard) healthy() bool {
   185  	if s.numberOfReplicas > 0 &&
   186  		len(s.workingReplicas)*2 > int(s.numberOfReplicas) {
   187  		return true
   188  	}
   189  	return false
   190  }
   191  
   192  // storeSet separates stores as expired and working.
   193  type storeSet struct {
   194  	serviceType pb.ServiceType
   195  	working     map[string]struct{}
   196  	expired     map[string]struct{}
   197  }
   198  
   199  func newStoreSet(serviceType pb.ServiceType) *storeSet {
   200  	return &storeSet{
   201  		serviceType: serviceType,
   202  		working:     make(map[string]struct{}),
   203  		expired:     make(map[string]struct{}),
   204  	}
   205  }
   206  
   207  // length returns number of all stores within this set.
   208  func (s *storeSet) length() int {
   209  	return len(s.working) + len(s.expired)
   210  }
   211  
   212  // shutdownExpiredStores
   213  func (s *storeSet) shutdownExpiredStores() []*operator.Operator {
   214  	return shutdownStores(s.serviceType, s.expired)
   215  }
   216  
   217  // shutdownWorkingStores generates operators to shut down working stores.
   218  func (s *storeSet) shutdownWorkingStores() []*operator.Operator {
   219  	return shutdownStores(s.serviceType, s.working)
   220  }
   221  
   222  // parseLogState separates log stores as expired and working.
   223  func parseLogState(cfg hakeeper.Config, logState pb.LogState, currTick uint64) *storeSet {
   224  	set := newStoreSet(pb.LogService)
   225  	for id, storeInfo := range logState.Stores {
   226  		if cfg.LogStoreExpired(storeInfo.Tick, currTick) {
   227  			set.expired[id] = struct{}{}
   228  		} else {
   229  			set.working[id] = struct{}{}
   230  		}
   231  	}
   232  	return set
   233  }
   234  
   235  // parseTnState separates tn stores as expired and working.
   236  func parseTnState(cfg hakeeper.Config, tnState pb.TNState, currTick uint64) *storeSet {
   237  	set := newStoreSet(pb.TNService)
   238  	for id, storeInfo := range tnState.Stores {
   239  		if cfg.TNStoreExpired(storeInfo.Tick, currTick) {
   240  			set.expired[id] = struct{}{}
   241  		} else {
   242  			set.working[id] = struct{}{}
   243  		}
   244  	}
   245  	return set
   246  }
   247  
   248  // shutdownStores generates operators to shut down stores.
   249  func shutdownStores(serviceType pb.ServiceType, stores map[string]struct{}) []*operator.Operator {
   250  	ops := make([]*operator.Operator, 0, len(stores))
   251  
   252  	switch serviceType {
   253  	case pb.LogService:
   254  		for id := range stores {
   255  			op := operator.NewOperator(
   256  				"logservice", operator.NoopShardID, operator.NoopEpoch,
   257  				operator.StopLogStore{StoreID: id},
   258  			)
   259  			ops = append(ops, op)
   260  		}
   261  	case pb.TNService:
   262  		for id := range stores {
   263  			op := operator.NewOperator(
   264  				"dnservice", operator.NoopShardID, operator.NoopEpoch,
   265  				operator.StopTnStore{StoreID: id},
   266  			)
   267  			ops = append(ops, op)
   268  		}
   269  	default:
   270  		panic("unexpected service type")
   271  	}
   272  
   273  	return ops
   274  }