github.com/matrixorigin/matrixone@v1.2.0/pkg/hakeeper/checkers/syshealth/check.go (about) 1 // Copyright 2021 - 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package syshealth 16 17 import ( 18 "fmt" 19 20 "github.com/matrixorigin/matrixone/pkg/hakeeper" 21 "github.com/matrixorigin/matrixone/pkg/hakeeper/operator" 22 "github.com/matrixorigin/matrixone/pkg/logutil" 23 pb "github.com/matrixorigin/matrixone/pkg/pb/logservice" 24 ) 25 26 const ( 27 defaultLogShardSize = 3 28 ) 29 30 // Check checks system healthy or not. 31 // If system wasn't healthy, we would generate 32 // operators in order to shut down all stores. 33 func Check( 34 cfg hakeeper.Config, 35 cluster pb.ClusterInfo, 36 tnState pb.TNState, 37 logState pb.LogState, 38 currTick uint64, 39 ) ([]*operator.Operator, bool) { 40 sysHealthy := true 41 42 // parse all log stores for expired stores mainly 43 logStores := parseLogState(cfg, logState, currTick) 44 if len(logStores.expired) == 0 { 45 return nil, sysHealthy 46 } 47 48 // check system healthy or not 49 expiredShards := listExpiredShards(logStores.expired, logStores.working, logState, cluster) 50 for _, shard := range expiredShards { 51 // if one of log shards wasn't healthy, the entire system wasn't healthy. 52 if !shard.healthy() { 53 sysHealthy = false 54 break 55 } 56 } 57 58 if sysHealthy { 59 return nil, sysHealthy 60 } 61 62 detail := "Expired logStore info..." 63 for uuid := range logStores.expired { 64 detail += fmt.Sprintf("store %s replicas: [", uuid) 65 for _, replicaInfo := range logState.Stores[uuid].Replicas { 66 detail += fmt.Sprintf("%d-%d, ", replicaInfo.ShardID, replicaInfo.ReplicaID) 67 } 68 detail += "]; " 69 } 70 71 logutil.GetGlobalLogger().Info(detail) 72 73 // parse all tn stores 74 tnStores := parseTnState(cfg, tnState, currTick) 75 76 // generate operators to shut down all stores 77 operators := make([]*operator.Operator, 0, logStores.length()+tnStores.length()) 78 operators = append(operators, logStores.shutdownExpiredStores()...) 79 operators = append(operators, logStores.shutdownWorkingStores()...) 80 operators = append(operators, tnStores.shutdownExpiredStores()...) 81 operators = append(operators, tnStores.shutdownWorkingStores()...) 82 83 return operators, sysHealthy 84 } 85 86 // logShardMap is just a syntax sugar. 87 type logShardMap map[uint64]*logShard 88 89 func newLogShardMap() logShardMap { 90 return make(map[uint64]*logShard) 91 } 92 93 // registerExpiredReplica registers replica as expired. 94 func (m logShardMap) registerExpiredReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) { 95 replicaID := replica.ReplicaID 96 shardID := replica.ShardID 97 98 if _, ok := m[shardID]; !ok { 99 m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster)) 100 } 101 m[shardID].registerExpiredReplica(replicaID) 102 } 103 104 // registerWorkingReplica registers replica as working. 105 func (m logShardMap) registerWorkingReplica(replica pb.LogReplicaInfo, cluster pb.ClusterInfo) { 106 replicaID := replica.ReplicaID 107 shardID := replica.ShardID 108 109 if _, ok := m[shardID]; !ok { 110 m[shardID] = newLogShard(shardID, getLogShardSize(shardID, cluster)) 111 } 112 m[shardID].registerWorkingReplica(replicaID) 113 } 114 115 // listExpiredShards lists those shards which has expired replica. 116 func listExpiredShards( 117 expiredStores map[string]struct{}, 118 workingStores map[string]struct{}, 119 logState pb.LogState, 120 cluster pb.ClusterInfo, 121 ) logShardMap { 122 expired := newLogShardMap() 123 124 // register log shards on expired stores 125 for id := range expiredStores { 126 expiredReplicas := logState.Stores[id].Replicas 127 for _, replica := range expiredReplicas { 128 expired.registerExpiredReplica(replica, cluster) 129 } 130 } 131 132 // register working replica for 133 for id := range workingStores { 134 workingReplicas := logState.Stores[id].Replicas 135 for _, replica := range workingReplicas { 136 // only register working replica for expired shards 137 if _, ok := expired[replica.ShardID]; ok { 138 expired.registerWorkingReplica(replica, cluster) 139 } 140 } 141 } 142 143 return expired 144 } 145 146 // getLogShardSize gets raft group size for the specified shard. 147 func getLogShardSize(shardID uint64, cluster pb.ClusterInfo) uint64 { 148 for _, shardMeta := range cluster.LogShards { 149 if shardMeta.ShardID == shardID { 150 return shardMeta.NumberOfReplicas 151 } 152 } 153 return defaultLogShardSize 154 } 155 156 // logShard records metadata for log shard. 157 type logShard struct { 158 shardID uint64 159 numberOfReplicas uint64 160 expiredReplicas map[uint64]struct{} 161 workingReplicas map[uint64]struct{} 162 } 163 164 func newLogShard(shardID uint64, numberOfReplicas uint64) *logShard { 165 return &logShard{ 166 shardID: shardID, 167 numberOfReplicas: numberOfReplicas, 168 expiredReplicas: make(map[uint64]struct{}), 169 workingReplicas: make(map[uint64]struct{}), 170 } 171 } 172 173 // registerExpiredReplica registers expired replica ID. 174 func (s *logShard) registerExpiredReplica(replicaID uint64) { 175 s.expiredReplicas[replicaID] = struct{}{} 176 } 177 178 // registerWorkingReplica registers working replica ID. 179 func (s *logShard) registerWorkingReplica(replicaID uint64) { 180 s.workingReplicas[replicaID] = struct{}{} 181 } 182 183 // healthy checks whether log shard working or not. 184 func (s *logShard) healthy() bool { 185 if s.numberOfReplicas > 0 && 186 len(s.workingReplicas)*2 > int(s.numberOfReplicas) { 187 return true 188 } 189 return false 190 } 191 192 // storeSet separates stores as expired and working. 193 type storeSet struct { 194 serviceType pb.ServiceType 195 working map[string]struct{} 196 expired map[string]struct{} 197 } 198 199 func newStoreSet(serviceType pb.ServiceType) *storeSet { 200 return &storeSet{ 201 serviceType: serviceType, 202 working: make(map[string]struct{}), 203 expired: make(map[string]struct{}), 204 } 205 } 206 207 // length returns number of all stores within this set. 208 func (s *storeSet) length() int { 209 return len(s.working) + len(s.expired) 210 } 211 212 // shutdownExpiredStores 213 func (s *storeSet) shutdownExpiredStores() []*operator.Operator { 214 return shutdownStores(s.serviceType, s.expired) 215 } 216 217 // shutdownWorkingStores generates operators to shut down working stores. 218 func (s *storeSet) shutdownWorkingStores() []*operator.Operator { 219 return shutdownStores(s.serviceType, s.working) 220 } 221 222 // parseLogState separates log stores as expired and working. 223 func parseLogState(cfg hakeeper.Config, logState pb.LogState, currTick uint64) *storeSet { 224 set := newStoreSet(pb.LogService) 225 for id, storeInfo := range logState.Stores { 226 if cfg.LogStoreExpired(storeInfo.Tick, currTick) { 227 set.expired[id] = struct{}{} 228 } else { 229 set.working[id] = struct{}{} 230 } 231 } 232 return set 233 } 234 235 // parseTnState separates tn stores as expired and working. 236 func parseTnState(cfg hakeeper.Config, tnState pb.TNState, currTick uint64) *storeSet { 237 set := newStoreSet(pb.TNService) 238 for id, storeInfo := range tnState.Stores { 239 if cfg.TNStoreExpired(storeInfo.Tick, currTick) { 240 set.expired[id] = struct{}{} 241 } else { 242 set.working[id] = struct{}{} 243 } 244 } 245 return set 246 } 247 248 // shutdownStores generates operators to shut down stores. 249 func shutdownStores(serviceType pb.ServiceType, stores map[string]struct{}) []*operator.Operator { 250 ops := make([]*operator.Operator, 0, len(stores)) 251 252 switch serviceType { 253 case pb.LogService: 254 for id := range stores { 255 op := operator.NewOperator( 256 "logservice", operator.NoopShardID, operator.NoopEpoch, 257 operator.StopLogStore{StoreID: id}, 258 ) 259 ops = append(ops, op) 260 } 261 case pb.TNService: 262 for id := range stores { 263 op := operator.NewOperator( 264 "dnservice", operator.NoopShardID, operator.NoopEpoch, 265 operator.StopTnStore{StoreID: id}, 266 ) 267 ops = append(ops, op) 268 } 269 default: 270 panic("unexpected service type") 271 } 272 273 return ops 274 }