github.com/matrixorigin/matrixone@v0.7.0/pkg/hakeeper/checkers/syshealth/check.go (about) 1 // Copyright 2021 - 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package syshealth 16 17 import ( 18 "github.com/matrixorigin/matrixone/pkg/hakeeper" 19 "github.com/matrixorigin/matrixone/pkg/hakeeper/operator" 20 pb "github.com/matrixorigin/matrixone/pkg/pb/logservice" 21 ) 22 23 const ( 24 defaultLogShardSize = 3 25 ) 26 27 // Check checks system healthy or not. 28 // If system wasn't healthy, we would generate 29 // operators in order to shut down all stores. 30 func Check( 31 cfg hakeeper.Config, 32 cluster pb.ClusterInfo, 33 dnState pb.DNState, 34 logState pb.LogState, 35 currTick uint64, 36 ) ([]*operator.Operator, bool) { 37 sysHealthy := true 38 39 // parse all log stores for expired stores mainly 40 logStores := parseLogState(cfg, logState, currTick) 41 if len(logStores.expired) == 0 { 42 return nil, sysHealthy 43 } 44 45 // check system healthy or not 46 expiredShards := listExpiredShards(logStores.expired, logStores.working, logState, cluster) 47 for _, shard := range expiredShards { 48 // if one of log shards wasn't healthy, the entire system wasn't healthy. 49 if !shard.healthy() { 50 sysHealthy = false 51 break 52 } 53 } 54 55 if sysHealthy { 56 return nil, sysHealthy 57 } 58 59 // parse all dn stores 60 dnStores := parseDnState(cfg, dnState, currTick) 61 62 // generate operators to shut down all stores 63 operators := make([]*operator.Operator, 0, logStores.length()+dnStores.length()) 64 operators = append(operators, logStores.shutdownExpiredStores()...) 65 operators = append(operators, logStores.shutdownWorkingStores()...) 66 operators = append(operators, dnStores.shutdownExpiredStores()...) 67 operators = append(operators, dnStores.shutdownWorkingStores()...) 68 69 return operators, sysHealthy 70 } 71 72 // logShardMap is just a syntax sugar. 73 type logShardMap map[uint64]*logShard 74 75 func newLogShardMap() logShardMap { 76 return make(map[uint64]*logShard) 77 } 78 79 // registerExpiredReplica registers replica as expired. 80 func (m logShardMap) registerExpiredReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) { 81 replicaID := replica.ReplicaID 82 shardID := replica.ShardID 83 84 if _, ok := m[shardID]; !ok { 85 m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster)) 86 } 87 m[shardID].registerExpiredReplica(replicaID) 88 } 89 90 // registerWorkingReplica registers replica as working. 91 func (m logShardMap) registerWorkingReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) { 92 replicaID := replica.ReplicaID 93 shardID := replica.ShardID 94 95 if _, ok := m[shardID]; !ok { 96 m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster)) 97 } 98 m[shardID].registerWorkingReplica(replicaID) 99 } 100 101 // listExpiredShards lists those shards which has expired replica. 102 func listExpiredShards( 103 expiredStores map[string]struct{}, 104 workingStores map[string]struct{}, 105 logState pb.LogState, 106 cluster pb.ClusterInfo, 107 ) logShardMap { 108 expired := newLogShardMap() 109 110 // register log shards on expired stores 111 for id := range expiredStores { 112 expiredReplicas := logState.Stores[id].Replicas 113 for _, replica := range expiredReplicas { 114 expired.registerExpiredReplica(replica, cluster) 115 } 116 } 117 118 // register working replica for 119 for id := range workingStores { 120 workingReplicas := logState.Stores[id].Replicas 121 for _, replica := range workingReplicas { 122 // only register working replica for expired shards 123 if _, ok := expired[replica.ShardID]; ok { 124 expired.registerWorkingReplica(replica, cluster) 125 } 126 } 127 } 128 129 return expired 130 } 131 132 // getLogShardSize gets raft group size for the specified shard. 133 func getLogShardSize(shardID uint64, cluster pb.ClusterInfo) uint64 { 134 for _, shardMeta := range cluster.LogShards { 135 if shardMeta.ShardID == shardID { 136 return shardMeta.NumberOfReplicas 137 } 138 } 139 return defaultLogShardSize 140 } 141 142 // logShard records metadata for log shard. 143 type logShard struct { 144 shardID uint64 145 numberOfReplicas uint64 146 expiredReplicas map[uint64]struct{} 147 workingReplicas map[uint64]struct{} 148 } 149 150 func newLogShard(shardID uint64, numberOfReplicas uint64) *logShard { 151 return &logShard{ 152 shardID: shardID, 153 numberOfReplicas: numberOfReplicas, 154 expiredReplicas: make(map[uint64]struct{}), 155 workingReplicas: make(map[uint64]struct{}), 156 } 157 } 158 159 // registerExpiredReplica registers expired replica ID. 160 func (s *logShard) registerExpiredReplica(replicaID uint64) { 161 s.expiredReplicas[replicaID] = struct{}{} 162 } 163 164 // registerWorkingReplica registers working replica ID. 165 func (s *logShard) registerWorkingReplica(replicaID uint64) { 166 s.workingReplicas[replicaID] = struct{}{} 167 } 168 169 // healthy checks whether log shard working or not. 170 func (s *logShard) healthy() bool { 171 if s.numberOfReplicas > 0 && 172 len(s.workingReplicas)*2 > int(s.numberOfReplicas) { 173 return true 174 } 175 return false 176 } 177 178 // storeSet separates stores as expired and working. 179 type storeSet struct { 180 serviceType pb.ServiceType 181 working map[string]struct{} 182 expired map[string]struct{} 183 } 184 185 func newStoreSet(serviceType pb.ServiceType) *storeSet { 186 return &storeSet{ 187 serviceType: serviceType, 188 working: make(map[string]struct{}), 189 expired: make(map[string]struct{}), 190 } 191 } 192 193 // length returns number of all stores within this set. 194 func (s *storeSet) length() int { 195 return len(s.working) + len(s.expired) 196 } 197 198 // shutdownExpiredStores 199 func (s *storeSet) shutdownExpiredStores() []*operator.Operator { 200 return shutdownStores(s.serviceType, s.expired) 201 } 202 203 // shutdownWorkingStores generates operators to shut down working stores. 204 func (s *storeSet) shutdownWorkingStores() []*operator.Operator { 205 return shutdownStores(s.serviceType, s.working) 206 } 207 208 // parseLogState separates log stores as expired and working. 209 func parseLogState(cfg hakeeper.Config, logState pb.LogState, currTick uint64) *storeSet { 210 set := newStoreSet(pb.LogService) 211 for id, storeInfo := range logState.Stores { 212 if cfg.LogStoreExpired(storeInfo.Tick, currTick) { 213 set.expired[id] = struct{}{} 214 } else { 215 set.working[id] = struct{}{} 216 } 217 } 218 return set 219 } 220 221 // parseDnState separates dn stores as expired and working. 222 func parseDnState(cfg hakeeper.Config, dnState pb.DNState, currTick uint64) *storeSet { 223 set := newStoreSet(pb.DNService) 224 for id, storeInfo := range dnState.Stores { 225 if cfg.DNStoreExpired(storeInfo.Tick, currTick) { 226 set.expired[id] = struct{}{} 227 } else { 228 set.working[id] = struct{}{} 229 } 230 } 231 return set 232 } 233 234 // shutdownStores generates operators to shut down stores. 235 func shutdownStores(serviceType pb.ServiceType, stores map[string]struct{}) []*operator.Operator { 236 ops := make([]*operator.Operator, 0, len(stores)) 237 238 switch serviceType { 239 case pb.LogService: 240 for id := range stores { 241 op := operator.NewOperator( 242 "logservice", operator.NoopShardID, operator.NoopEpoch, 243 operator.StopLogStore{StoreID: id}, 244 ) 245 ops = append(ops, op) 246 } 247 case pb.DNService: 248 for id := range stores { 249 op := operator.NewOperator( 250 "dnservice", operator.NoopShardID, operator.NoopEpoch, 251 operator.StopDnStore{StoreID: id}, 252 ) 253 ops = append(ops, op) 254 } 255 default: 256 panic("unexpected service type") 257 } 258 259 return ops 260 }