github.com/matrixorigin/matrixone@v1.2.0/pkg/logservice/store_hakeeper_check.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logservice

import (
	"context"
	"fmt"
	"os"
	"sync/atomic"
	"time"

	"go.uber.org/zap"

	"github.com/google/uuid"
	"github.com/matrixorigin/matrixone/pkg/hakeeper"
	"github.com/matrixorigin/matrixone/pkg/hakeeper/bootstrap"
	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
)

const (
	minIDAllocCapacity   uint64 = 1024
	defaultIDBatchSize   uint64 = 1024 * 10
	checkBootstrapCycles        = 100
)

var (
	hakeeperDefaultTimeout = 2 * time.Second
)

type idAllocator struct {
	// [nextID, lastID] is the range of IDs that can be assigned.
	// the next ID to be assigned is nextID
	nextID uint64
	lastID uint64
}

var _ hakeeper.IDAllocator = (*idAllocator)(nil)

func newIDAllocator() hakeeper.IDAllocator {
	return &idAllocator{nextID: 1, lastID: 0}
}

// Next returns the next ID in the current range, or false when the range is
// exhausted.
func (a *idAllocator) Next() (uint64, bool) {
	if a.nextID <= a.lastID {
		v := a.nextID
		a.nextID++
		return v, true
	}
	return 0, false
}

func (a *idAllocator) Set(next uint64, last uint64) {
	// make sure that this id allocator never emits any id smaller than
	// K8SIDRangeEnd
	if next < hakeeper.K8SIDRangeEnd {
		panic("invalid id allocator range")
	}
	a.nextID = next
	a.lastID = last
}

// Capacity returns the number of IDs still available in the current range.
func (a *idAllocator) Capacity() uint64 {
	if a.nextID <= a.lastID {
		return (a.lastID - a.nextID) + 1
	}
	return 0
}

// setInitialClusterInfo proposes the initial cluster layout (log shards, TN
// shards and the log replica count) to the HAKeeper shard.
func (l *store) setInitialClusterInfo(numOfLogShards uint64,
	numOfTNShards uint64, numOfLogReplicas uint64, nextID uint64, nextIDByKey map[string]uint64) error {
	cmd := hakeeper.GetInitialClusterRequestCmd(numOfLogShards,
		numOfTNShards, numOfLogReplicas, nextID, nextIDByKey)
	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
	defer cancel()
	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
	result, err := l.propose(ctx, session, cmd)
	if err != nil {
		l.runtime.Logger().Error("failed to propose initial cluster info", zap.Error(err))
		return err
	}
	if result.Value == uint64(pb.HAKeeperBootstrapFailed) {
		panic("bootstrap failed")
	}
	if result.Value != uint64(pb.HAKeeperCreated) {
		l.runtime.Logger().Error("initial cluster info already set")
	}
	return nil
}

// updateIDAlloc reserves a batch of count IDs from the HAKeeper and refills
// the local allocator with the returned range.
func (l *store) updateIDAlloc(count uint64) error {
	cmd := hakeeper.GetAllocateIDCmd(pb.CNAllocateID{Batch: count})
	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
	defer cancel()
	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
	result, err := l.propose(ctx, session, cmd)
	if err != nil {
		l.runtime.Logger().Error("propose get id failed", zap.Error(err))
		return err
	}
	// TODO: add a test for this
	l.alloc.Set(result.Value, result.Value+count-1)
	return nil
}
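
// An illustrative use of the allocator contract above (the concrete numbers
// are examples only). getScheduleCommand below refills the allocator through
// updateIDAlloc whenever its capacity drops under minIDAllocCapacity:
//
//	alloc := newIDAllocator()     // starts empty: nextID=1, lastID=0
//	_, ok := alloc.Next()         // ok == false, nothing to hand out yet
//	alloc.Set(hakeeper.K8SIDRangeEnd, hakeeper.K8SIDRangeEnd+9)
//	id, ok := alloc.Next()        // ok == true, id == hakeeper.K8SIDRangeEnd
//	left := alloc.Capacity()      // 9 IDs remain in the range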

// getCheckerStateFromLeader returns the current checker state and term when
// this replica is the HAKeeper leader. On a non-leader replica it stops the
// cron task scheduler and returns a nil state.
func (l *store) getCheckerStateFromLeader() (*pb.CheckerState, uint64) {
	isLeader, term, err := l.isLeaderHAKeeper()
	if err != nil {
		l.runtime.Logger().Error("failed to get HAKeeper Leader ID", zap.Error(err))
		return nil, term
	}

	if !isLeader {
		l.taskScheduler.StopScheduleCronTask()
		return nil, term
	}
	state, err := l.getCheckerState()
	if err != nil {
		// TODO: check whether this is temp error
		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
		return nil, term
	}

	return state, term
}

var debugPrintHAKeeperState atomic.Bool

// hakeeperCheck runs one iteration of the HAKeeper check loop on the leader,
// dispatching on the current HAKeeper state: wait for the initial cluster
// info, drive the bootstrap, verify bootstrap progress, handle a bootstrap
// failure, or run the regular health check once the cluster is running.
func (l *store) hakeeperCheck() {
	state, term := l.getCheckerStateFromLeader()
	if state == nil {
		return
	}

	switch state.State {
	case pb.HAKeeperCreated:
		l.runtime.Logger().Warn("waiting for initial cluster info to be set, check skipped")
		return
	case pb.HAKeeperBootstrapping:
		l.bootstrap(term, state)
	case pb.HAKeeperBootstrapCommandsReceived:
		l.checkBootstrap(state)
	case pb.HAKeeperBootstrapFailed:
		l.handleBootstrapFailure()
	case pb.HAKeeperRunning:
		if debugPrintHAKeeperState.CompareAndSwap(false, true) {
			l.runtime.Logger().Info("HAKeeper is running",
				zap.Uint64("next id", state.NextId))
		}
		l.healthCheck(term, state)
	default:
		panic("unknown HAKeeper state")
	}
}

// assertHAKeeperState panics if the HAKeeper is not in the expected state.
func (l *store) assertHAKeeperState(s pb.HAKeeperState) {
	state, err := l.getCheckerState()
	if err != nil {
		// TODO: check whether this is temp error
		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
		return
	}
	if state.State != s {
		l.runtime.Logger().Panic("unexpected state",
			zap.String("expected", s.String()),
			zap.String("got", state.State.String()))
	}
}

func (l *store) handleBootstrapFailure() {
	panic("failed to bootstrap the cluster")
}

// healthCheck generates schedule commands for the running cluster and
// proposes them to the HAKeeper.
func (l *store) healthCheck(term uint64, state *pb.CheckerState) {
	l.assertHAKeeperState(pb.HAKeeperRunning)
	defer l.assertHAKeeperState(pb.HAKeeperRunning)
	cmds, err := l.getScheduleCommand(true, term, state)
	if err != nil {
		l.runtime.Logger().Error("failed to get check schedule commands", zap.Error(err))
		return
	}
	l.runtime.Logger().Debug(fmt.Sprintf("cluster health check generated %d schedule commands", len(cmds)))
	if len(cmds) > 0 {
		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
		defer cancel()
		for _, cmd := range cmds {
			l.runtime.Logger().Debug("adding schedule command to hakeeper", zap.String("command", cmd.LogString()))
		}
		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
			// TODO: check whether this is temp error
			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
			return
		}
	}
}

// taskSchedule drives the task scheduler according to its state: register the
// task table user when the scheduler has just been created, or start cron task
// scheduling and schedule tasks against the current CN state while it is
// running.
func (l *store) taskSchedule(state *pb.CheckerState) {
	l.assertHAKeeperState(pb.HAKeeperRunning)
	defer l.assertHAKeeperState(pb.HAKeeperRunning)

	switch state.TaskSchedulerState {
	case pb.TaskSchedulerCreated:
		l.registerTaskUser()
	case pb.TaskSchedulerRunning:
		l.taskScheduler.StartScheduleCronTask()
		l.taskScheduler.Schedule(state.CNState, state.Tick)
	case pb.TaskSchedulerStopped:
	default:
		panic("unknown TaskScheduler state")
	}
}
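
// registerTaskUser registers the task table user with the HAKeeper, preferring
// credentials taken from the environment and falling back to a randomly
// generated user when they are not set.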
func (l *store) registerTaskUser() {
	user, ok := getTaskTableUserFromEnv()
	if !ok {
		user = randomUser()
	}

	// TODO: rename TaskTableUser to moadmin
	if err := l.setTaskTableUser(user); err != nil {
		l.runtime.Logger().Error("failed to set task table user", zap.Error(err))
	}
}

// bootstrap generates bootstrap schedule commands from the cluster info and
// proposes them to the HAKeeper, then arms the bootstrap check counter.
func (l *store) bootstrap(term uint64, state *pb.CheckerState) {
	cmds, err := l.getScheduleCommand(false, term, state)
	if err != nil {
		l.runtime.Logger().Error("failed to get bootstrap schedule commands", zap.Error(err))
		return
	}
	if len(cmds) > 0 {
		for _, c := range cmds {
			l.runtime.Logger().Debug("bootstrap cmd", zap.String("cmd", c.LogString()))
		}
		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
		defer cancel()
		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
			// TODO: check whether this is temp error
			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
			return
		}
		l.bootstrapCheckCycles = checkBootstrapCycles
		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
		l.assertHAKeeperState(pb.HAKeeperBootstrapCommandsReceived)
	}
}

// checkBootstrap checks whether the bootstrap has completed in the log state.
// It allows up to checkBootstrapCycles check iterations before marking the
// bootstrap as failed, and marks the cluster as running once the bootstrap is
// observed to have succeeded.
func (l *store) checkBootstrap(state *pb.CheckerState) {
	if l.bootstrapCheckCycles == 0 {
		if err := l.setBootstrapState(false); err != nil {
			panic(err)
		}
		l.assertHAKeeperState(pb.HAKeeperBootstrapFailed)
	}

	if l.bootstrapMgr == nil {
		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
	}
	if !l.bootstrapMgr.CheckBootstrap(state.LogState) {
		l.bootstrapCheckCycles--
	} else {
		if err := l.setBootstrapState(true); err != nil {
			panic(err)
		}
		l.assertHAKeeperState(pb.HAKeeperRunning)
	}
}
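
// setBootstrapState records the bootstrap outcome in the HAKeeper state
// machine: HAKeeperRunning on success, HAKeeperBootstrapFailed otherwise.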
func (l *store) setBootstrapState(success bool) error {
	state := pb.HAKeeperRunning
	if !success {
		state = pb.HAKeeperBootstrapFailed
	}
	cmd := hakeeper.GetSetStateCmd(state)
	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
	defer cancel()
	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
	_, err := l.propose(ctx, session, cmd)
	return err
}

func (l *store) getCheckerState() (*pb.CheckerState, error) {
	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
	defer cancel()
	s, err := l.read(ctx, hakeeper.DefaultHAKeeperShardID, &hakeeper.StateQuery{})
	if err != nil {
		return &pb.CheckerState{}, err
	}
	return s.(*pb.CheckerState), nil
}

func (l *store) getScheduleCommand(check bool,
	term uint64, state *pb.CheckerState) ([]pb.ScheduleCommand, error) {
	if l.alloc.Capacity() < minIDAllocCapacity {
		if err := l.updateIDAlloc(defaultIDBatchSize); err != nil {
			return nil, err
		}
	}

	if check {
		return l.checker.Check(l.alloc, *state), nil
	}
	m := bootstrap.NewBootstrapManager(state.ClusterInfo)
	return m.Bootstrap(l.alloc, state.TNState, state.LogState)
}

func (l *store) setTaskTableUser(user pb.TaskTableUser) error {
	cmd := hakeeper.GetTaskTableUserCmd(user)
	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
	defer cancel()
	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
	result, err := l.propose(ctx, session, cmd)
	if err != nil {
		l.runtime.Logger().Error("failed to propose task user info", zap.Error(err))
		return err
	}
	if result.Value == uint64(pb.TaskSchedulerStopped) {
		panic("failed to set task user")
	}
	if result.Value != uint64(pb.TaskSchedulerCreated) {
		l.runtime.Logger().Error("task user info already set")
	}
	return nil
}

const (
	moAdminUser     = "mo_admin_user"
	moAdminPassword = "mo_admin_password"
)

func getTaskTableUserFromEnv() (pb.TaskTableUser, bool) {
	username, ok := os.LookupEnv(moAdminUser)
	if !ok {
		return pb.TaskTableUser{}, false
	}
	password, ok := os.LookupEnv(moAdminPassword)
	if !ok {
		return pb.TaskTableUser{}, false
	}
	if username == "" || password == "" {
		return pb.TaskTableUser{}, false
	}
	return pb.TaskTableUser{Username: username, Password: password}, true
}

func randomUser() pb.TaskTableUser {
	return pb.TaskTableUser{
		Username: uuid.NewString(),
		Password: uuid.NewString(),
	}
}
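
// An illustrative sketch of the environment handling above (the variable
// values are examples only): both mo_admin_user and mo_admin_password must be
// set and non-empty for the environment credentials to be used.
//
//	os.Setenv(moAdminUser, "admin")
//	os.Setenv(moAdminPassword, "secret")
//	user, ok := getTaskTableUserFromEnv() // ok == true, user.Username == "admin"
//
//	os.Unsetenv(moAdminPassword)
//	_, ok = getTaskTableUserFromEnv()     // ok == false, caller falls back to randomUser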