github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/store_hakeeper_check.go (about) 1 // Copyright 2021 - 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package logservice 16 17 import ( 18 "context" 19 "fmt" 20 "os" 21 "time" 22 23 "go.uber.org/zap" 24 25 "github.com/google/uuid" 26 "github.com/matrixorigin/matrixone/pkg/hakeeper" 27 "github.com/matrixorigin/matrixone/pkg/hakeeper/bootstrap" 28 pb "github.com/matrixorigin/matrixone/pkg/pb/logservice" 29 ) 30 31 const ( 32 minIDAllocCapacity uint64 = 1024 33 defaultIDBatchSize uint64 = 1024 * 10 34 35 hakeeperDefaultTimeout = 2 * time.Second 36 checkBootstrapCycles = 100 37 ) 38 39 type idAllocator struct { 40 // [nextID, lastID] is the range of IDs that can be assigned. 41 // the next ID to be assigned is nextID 42 nextID uint64 43 lastID uint64 44 } 45 46 var _ hakeeper.IDAllocator = (*idAllocator)(nil) 47 48 func newIDAllocator() hakeeper.IDAllocator { 49 return &idAllocator{nextID: 1, lastID: 0} 50 } 51 52 func (a *idAllocator) Next() (uint64, bool) { 53 if a.nextID <= a.lastID { 54 v := a.nextID 55 a.nextID++ 56 return v, true 57 } 58 return 0, false 59 } 60 61 func (a *idAllocator) Set(next uint64, last uint64) { 62 // make sure that this id allocator never emit any id smaller than 63 // K8SIDRangeEnd 64 if next < hakeeper.K8SIDRangeEnd { 65 panic("invalid id allocator range") 66 } 67 a.nextID = next 68 a.lastID = last 69 } 70 71 func (a *idAllocator) Capacity() uint64 { 72 if a.nextID <= a.lastID { 73 return (a.lastID - a.nextID) + 1 74 } 75 return 0 76 } 77 78 func (l *store) setInitialClusterInfo(numOfLogShards uint64, 79 numOfDNShards uint64, numOfLogReplicas uint64) error { 80 cmd := hakeeper.GetInitialClusterRequestCmd(numOfLogShards, 81 numOfDNShards, numOfLogReplicas) 82 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 83 defer cancel() 84 session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID) 85 result, err := l.propose(ctx, session, cmd) 86 if err != nil { 87 l.runtime.Logger().Error("failed to propose initial cluster info", zap.Error(err)) 88 return err 89 } 90 if result.Value == uint64(pb.HAKeeperBootstrapFailed) { 91 panic("bootstrap failed") 92 } 93 if result.Value != uint64(pb.HAKeeperCreated) { 94 l.runtime.Logger().Error("initial cluster info already set") 95 } 96 return nil 97 } 98 99 func (l *store) updateIDAlloc(count uint64) error { 100 cmd := hakeeper.GetGetIDCmd(count) 101 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 102 defer cancel() 103 session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID) 104 result, err := l.propose(ctx, session, cmd) 105 if err != nil { 106 l.runtime.Logger().Error("propose get id failed", zap.Error(err)) 107 return err 108 } 109 // TODO: add a test for this 110 l.alloc.Set(result.Value, result.Value+count-1) 111 return nil 112 } 113 114 func (l *store) hakeeperCheck() { 115 isLeader, term, err := l.isLeaderHAKeeper() 116 if err != nil { 117 l.runtime.Logger().Error("failed to get HAKeeper Leader ID", zap.Error(err)) 118 return 119 } 120 121 if !isLeader { 122 l.taskScheduler.StopScheduleCronTask() 123 return 124 } 125 state, err := l.getCheckerState() 126 if err != nil { 127 // TODO: check whether this is temp error 128 l.runtime.Logger().Error("failed to get checker state", zap.Error(err)) 129 return 130 } 131 switch state.State { 132 case pb.HAKeeperCreated: 133 l.runtime.Logger().Warn("waiting for initial cluster info to be set, check skipped") 134 return 135 case pb.HAKeeperBootstrapping: 136 l.bootstrap(term, state) 137 case pb.HAKeeperBootstrapCommandsReceived: 138 l.checkBootstrap(state) 139 case pb.HAKeeperBootstrapFailed: 140 l.handleBootstrapFailure() 141 case pb.HAKeeperRunning: 142 l.healthCheck(term, state) 143 l.taskSchedule(state) 144 default: 145 panic("unknown HAKeeper state") 146 } 147 } 148 149 func (l *store) assertHAKeeperState(s pb.HAKeeperState) { 150 state, err := l.getCheckerState() 151 if err != nil { 152 // TODO: check whether this is temp error 153 l.runtime.Logger().Error("failed to get checker state", zap.Error(err)) 154 return 155 } 156 if state.State != s { 157 l.runtime.Logger().Panic("unexpected state", 158 zap.String("expected", s.String()), 159 zap.String("got", state.State.String())) 160 } 161 } 162 163 func (l *store) handleBootstrapFailure() { 164 panic("failed to bootstrap the cluster") 165 } 166 167 func (l *store) healthCheck(term uint64, state *pb.CheckerState) { 168 l.assertHAKeeperState(pb.HAKeeperRunning) 169 defer l.assertHAKeeperState(pb.HAKeeperRunning) 170 cmds, err := l.getScheduleCommand(true, term, state) 171 if err != nil { 172 l.runtime.Logger().Error("failed to get check schedule commands", zap.Error(err)) 173 return 174 } 175 l.runtime.Logger().Debug(fmt.Sprintf("cluster health check generated %d schedule commands", len(cmds))) 176 if len(cmds) > 0 { 177 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 178 defer cancel() 179 for _, cmd := range cmds { 180 l.runtime.Logger().Debug("adding schedule command to hakeeper", zap.String("command", cmd.LogString())) 181 } 182 if err := l.addScheduleCommands(ctx, term, cmds); err != nil { 183 // TODO: check whether this is temp error 184 l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err)) 185 return 186 } 187 } 188 } 189 190 func (l *store) taskSchedule(state *pb.CheckerState) { 191 l.assertHAKeeperState(pb.HAKeeperRunning) 192 defer l.assertHAKeeperState(pb.HAKeeperRunning) 193 194 switch state.TaskSchedulerState { 195 case pb.TaskSchedulerCreated: 196 l.registerTaskUser() 197 case pb.TaskSchedulerRunning: 198 l.taskScheduler.StartScheduleCronTask() 199 l.taskScheduler.Schedule(state.CNState, state.Tick) 200 case pb.TaskSchedulerStopped: 201 default: 202 panic("unknown TaskScheduler state") 203 } 204 } 205 206 func (l *store) registerTaskUser() { 207 user, ok := getTaskTableUserFromEnv() 208 if !ok { 209 user = randomUser() 210 } 211 212 // TODO: rename TaskTableUser to moadmin 213 if err := l.setTaskTableUser(user); err != nil { 214 l.runtime.Logger().Error("failed to set task table user", zap.Error(err)) 215 } 216 } 217 218 func (l *store) bootstrap(term uint64, state *pb.CheckerState) { 219 cmds, err := l.getScheduleCommand(false, term, state) 220 if err != nil { 221 l.runtime.Logger().Error("failed to get bootstrap schedule commands", zap.Error(err)) 222 return 223 } 224 if len(cmds) > 0 { 225 for _, c := range cmds { 226 l.runtime.Logger().Debug("bootstrap cmd", zap.String("cmd", c.LogString())) 227 } 228 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 229 defer cancel() 230 if err := l.addScheduleCommands(ctx, term, cmds); err != nil { 231 // TODO: check whether this is temp error 232 l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err)) 233 return 234 } 235 l.bootstrapCheckCycles = checkBootstrapCycles 236 l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo) 237 l.assertHAKeeperState(pb.HAKeeperBootstrapCommandsReceived) 238 } 239 } 240 241 func (l *store) checkBootstrap(state *pb.CheckerState) { 242 if l.bootstrapCheckCycles == 0 { 243 if err := l.setBootstrapState(false); err != nil { 244 panic(err) 245 } 246 l.assertHAKeeperState(pb.HAKeeperBootstrapFailed) 247 } 248 249 if l.bootstrapMgr == nil { 250 l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo) 251 } 252 if !l.bootstrapMgr.CheckBootstrap(state.LogState) { 253 l.bootstrapCheckCycles-- 254 } else { 255 if err := l.setBootstrapState(true); err != nil { 256 panic(err) 257 } 258 l.assertHAKeeperState(pb.HAKeeperRunning) 259 } 260 } 261 262 func (l *store) setBootstrapState(success bool) error { 263 state := pb.HAKeeperRunning 264 if !success { 265 state = pb.HAKeeperBootstrapFailed 266 } 267 cmd := hakeeper.GetSetStateCmd(state) 268 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 269 defer cancel() 270 session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID) 271 _, err := l.propose(ctx, session, cmd) 272 return err 273 } 274 275 func (l *store) getCheckerState() (*pb.CheckerState, error) { 276 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 277 defer cancel() 278 s, err := l.read(ctx, hakeeper.DefaultHAKeeperShardID, &hakeeper.StateQuery{}) 279 if err != nil { 280 return &pb.CheckerState{}, err 281 } 282 return s.(*pb.CheckerState), nil 283 } 284 285 func (l *store) getScheduleCommand(check bool, 286 term uint64, state *pb.CheckerState) ([]pb.ScheduleCommand, error) { 287 if l.alloc.Capacity() < minIDAllocCapacity { 288 if err := l.updateIDAlloc(defaultIDBatchSize); err != nil { 289 return nil, err 290 } 291 } 292 293 if check { 294 return l.checker.Check(l.alloc, *state), nil 295 } 296 m := bootstrap.NewBootstrapManager(state.ClusterInfo) 297 return m.Bootstrap(l.alloc, state.DNState, state.LogState) 298 } 299 300 func (l *store) setTaskTableUser(user pb.TaskTableUser) error { 301 cmd := hakeeper.GetTaskTableUserCmd(user) 302 ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout) 303 defer cancel() 304 session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID) 305 result, err := l.propose(ctx, session, cmd) 306 if err != nil { 307 l.runtime.Logger().Error("failed to propose task user info", zap.Error(err)) 308 return err 309 } 310 if result.Value == uint64(pb.TaskSchedulerStopped) { 311 panic("failed to set task user") 312 } 313 if result.Value != uint64(pb.TaskSchedulerCreated) { 314 l.runtime.Logger().Error("task user info already set") 315 } 316 return nil 317 } 318 319 const ( 320 moAdminUser = "mo_admin_user" 321 moAdminPassword = "mo_admin_password" 322 ) 323 324 func getTaskTableUserFromEnv() (pb.TaskTableUser, bool) { 325 username, ok := os.LookupEnv(moAdminUser) 326 if !ok { 327 return pb.TaskTableUser{}, false 328 } 329 password, ok := os.LookupEnv(moAdminPassword) 330 if !ok { 331 return pb.TaskTableUser{}, false 332 } 333 if username == "" || password == "" { 334 return pb.TaskTableUser{}, false 335 } 336 return pb.TaskTableUser{Username: username, Password: password}, true 337 } 338 339 func randomUser() pb.TaskTableUser { 340 return pb.TaskTableUser{ 341 Username: uuid.NewString(), 342 Password: uuid.NewString(), 343 } 344 }