github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/highavailability/ha.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package highavailability 21 22 import ( 23 "context" 24 "errors" 25 "fmt" 26 "sort" 27 "strings" 28 "sync" 29 "time" 30 31 "github.com/go-logr/logr" 32 probing "github.com/prometheus-community/pro-bing" 33 ctrl "sigs.k8s.io/controller-runtime" 34 35 dcs3 "github.com/1aal/kubeblocks/pkg/lorry/dcs" 36 "github.com/1aal/kubeblocks/pkg/lorry/engines" 37 "github.com/1aal/kubeblocks/pkg/lorry/engines/register" 38 viper "github.com/1aal/kubeblocks/pkg/viperx" 39 ) 40 41 type Ha struct { 42 ctx context.Context 43 dbManager engines.DBManager 44 dcs dcs3.DCS 45 logger logr.Logger 46 deleteLock sync.Mutex 47 } 48 49 var ha *Ha 50 51 func NewHa() *Ha { 52 logger := ctrl.Log.WithName("HA") 53 54 dcs := dcs3.GetStore() 55 manager, err := register.GetDBManager() 56 if err != nil { 57 logger.Error(err, "No DB Manager") 58 return nil 59 } 60 61 ha = &Ha{ 62 ctx: context.Background(), 63 dcs: dcs, 64 logger: logger, 65 dbManager: manager, 66 } 67 return ha 68 } 69 70 func GetHa() *Ha { 71 return ha 72 } 73 74 func (ha *Ha) RunCycle() { 75 cluster, err := ha.dcs.GetCluster() 76 if err != nil { 77 ha.logger.Error(err, "Get Cluster err") 78 return 79 } 80 81 if !cluster.HaConfig.IsEnable() { 82 return 83 } 84 85 currentMember := cluster.GetMemberWithName(ha.dbManager.GetCurrentMemberName()) 86 87 if cluster.HaConfig.IsDeleting(currentMember) { 88 ha.logger.Info("Current Member is deleted!") 89 _ = ha.DeleteCurrentMember(ha.ctx, cluster) 90 return 91 } 92 93 if !ha.dbManager.IsRunning() { 94 ha.logger.Info("DB Service is not running, wait for lorryctl to start it") 95 if ha.dcs.HasLease() { 96 _ = ha.dcs.ReleaseLease() 97 } 98 _ = ha.dbManager.Start(ha.ctx, cluster) 99 return 100 } 101 102 DBState := ha.dbManager.GetDBState(ha.ctx, cluster) 103 // store leader's db state in dcs 104 if cluster.Leader != nil && cluster.Leader.Name == ha.dbManager.GetCurrentMemberName() { 105 cluster.Leader.DBState = DBState 106 } 107 108 switch { 109 // IsClusterHealthy is just for consensus cluster healthy check. 110 // For Replication cluster IsClusterHealthy will always return true, 111 // and its cluster's healthy is equal to leader member's healthy. 112 case !ha.dbManager.IsClusterHealthy(ha.ctx, cluster): 113 ha.logger.Error(nil, "The cluster is not healthy, wait...") 114 115 case !ha.dbManager.IsCurrentMemberInCluster(ha.ctx, cluster) && int(cluster.Replicas) > len(ha.dbManager.GetMemberAddrs(ha.ctx, cluster)): 116 ha.logger.Info("Current member is not in cluster, add it to cluster") 117 _ = ha.dbManager.JoinCurrentMemberToCluster(ha.ctx, cluster) 118 119 case !ha.dbManager.IsCurrentMemberHealthy(ha.ctx, cluster): 120 ha.logger.Info("DB Service is not healthy, do some recover") 121 if ha.dcs.HasLease() { 122 _ = ha.dcs.ReleaseLease() 123 } 124 // dbManager.Recover() 125 126 case !cluster.IsLocked(): 127 ha.logger.Info("Cluster has no leader, attempt to take the leader") 128 if ha.IsHealthiestMember(ha.ctx, cluster) { 129 cluster.Leader.DBState = DBState 130 if ha.dcs.AttempAcquireLease() == nil { 131 err := ha.dbManager.Promote(ha.ctx, cluster) 132 if err != nil { 133 ha.logger.Error(err, "Take the leader failed") 134 _ = ha.dcs.ReleaseLease() 135 } else { 136 ha.logger.Info("Take the leader success!") 137 } 138 } 139 } 140 141 case ha.dcs.HasLease(): 142 ha.logger.Info("This member is Cluster's leader") 143 if cluster.Switchover != nil { 144 if cluster.Switchover.Leader == ha.dbManager.GetCurrentMemberName() || 145 (cluster.Switchover.Candidate != "" && cluster.Switchover.Candidate != ha.dbManager.GetCurrentMemberName()) { 146 if ha.HasOtherHealthyMember(cluster) { 147 _ = ha.dbManager.Demote(ha.ctx) 148 _ = ha.dcs.ReleaseLease() 149 break 150 } 151 152 } else if cluster.Switchover.Candidate == "" || cluster.Switchover.Candidate == ha.dbManager.GetCurrentMemberName() { 153 if !ha.dbManager.IsPromoted(ha.ctx) { 154 // wait and retry 155 break 156 } 157 _ = ha.dcs.DeleteSwitchover() 158 } 159 } 160 161 if ha.dbManager.HasOtherHealthyLeader(ha.ctx, cluster) != nil { 162 // this case is applicable only to consensus cluster, where the db's internal 163 // role services as the source of truth. 164 // for replicationset cluster, HasOtherHealthyLeader will always be false. 165 ha.logger.Info("Release leader") 166 _ = ha.dcs.ReleaseLease() 167 break 168 } 169 err := ha.dbManager.Promote(ha.ctx, cluster) 170 if err != nil { 171 ha.logger.Error(err, "promote failed") 172 break 173 } 174 175 ha.logger.Info("Refresh leader ttl") 176 _ = ha.dcs.UpdateLease() 177 178 if int(cluster.Replicas) < len(ha.dbManager.GetMemberAddrs(ha.ctx, cluster)) && cluster.Replicas != 0 { 179 ha.DecreaseClusterReplicas(cluster) 180 } 181 182 case !ha.dcs.HasLease(): 183 if cluster.Switchover != nil { 184 break 185 } 186 // TODO: In the event that the database service and SQL channel both go down concurrently, eg. Pod deleted, 187 // there is no healthy leader node and the lock remains unreleased, attempt to acquire the leader lock. 188 189 // leaderMember := cluster.GetLeaderMember() 190 // lockOwnerIsLeader, _ := ha.dbManager.IsLeaderMember(ha.ctx, cluster, leaderMember) 191 // currentMemberIsLeader, _ := ha.dbManager.IsLeader(context.TODO(), cluster) 192 // if lockOwnerIsLeader && currentMemberIsLeader { 193 // ha.logger.Infof("Lock owner is real Leader, demote myself and follow the real leader") 194 _ = ha.dbManager.Demote(ha.ctx) 195 _ = ha.dbManager.Follow(ha.ctx, cluster) 196 } 197 } 198 199 func (ha *Ha) Start() { 200 ha.logger.Info("HA starting") 201 cluster, err := ha.dcs.GetCluster() 202 if cluster == nil { 203 ha.logger.Error(err, "Get Cluster error, so HA exists.", "cluster-name", ha.dcs.GetClusterName()) 204 return 205 } 206 207 // isPodReady, err := ha.IsPodReady() 208 // for err != nil || !isPodReady { 209 // ha.logger.Info("Waiting for dns resolution to be ready") 210 // time.Sleep(3 * time.Second) 211 // isPodReady, err = ha.IsPodReady() 212 // } 213 ha.logger.Info("dns resolution is ready") 214 215 ha.logger.Info(fmt.Sprintf("cluster: %v", cluster)) 216 isInitialized, err := ha.dbManager.IsClusterInitialized(context.TODO(), cluster) 217 for err != nil || !isInitialized { 218 ha.logger.Info("Waiting for the database cluster to be initialized.") 219 // TODO: implement dbmanager initialize to replace pod's entrypoint scripts 220 // if I am the node of index 0, then do initialization 221 if err == nil && !isInitialized && ha.dbManager.IsFirstMember() { 222 ha.logger.Info("Initialize cluster.") 223 err := ha.dbManager.InitializeCluster(ha.ctx, cluster) 224 if err != nil { 225 ha.logger.Error(err, "Cluster initialize failed") 226 } 227 } 228 time.Sleep(5 * time.Second) 229 isInitialized, err = ha.dbManager.IsClusterInitialized(context.TODO(), cluster) 230 } 231 ha.logger.Info("The database cluster is initialized.") 232 233 isRootCreated, err := ha.dbManager.IsRootCreated(ha.ctx) 234 for err != nil || !isRootCreated { 235 if err == nil && !isRootCreated && ha.dbManager.IsFirstMember() { 236 ha.logger.Info("Create Root.") 237 err := ha.dbManager.CreateRoot(ha.ctx) 238 if err != nil { 239 ha.logger.Error(err, "Cluster initialize failed") 240 } 241 } 242 time.Sleep(5 * time.Second) 243 isRootCreated, err = ha.dbManager.IsRootCreated(ha.ctx) 244 } 245 246 isExist, _ := ha.dcs.IsLeaseExist() 247 for !isExist { 248 if ok, _ := ha.dbManager.IsLeader(context.Background(), cluster); ok { 249 err := ha.dcs.Initialize(cluster) 250 if err != nil { 251 ha.logger.Error(err, "DCS initialize failed") 252 time.Sleep(5 * time.Second) 253 continue 254 } 255 break 256 } 257 258 ha.logger.Info("Waiting for the database Leader to be ready.") 259 time.Sleep(5 * time.Second) 260 isExist, _ = ha.dcs.IsLeaseExist() 261 } 262 263 for { 264 ha.RunCycle() 265 time.Sleep(10 * time.Second) 266 } 267 } 268 269 func (ha *Ha) DecreaseClusterReplicas(cluster *dcs3.Cluster) { 270 hosts := ha.dbManager.GetMemberAddrs(ha.ctx, cluster) 271 sort.Strings(hosts) 272 deleteHost := hosts[len(hosts)-1] 273 ha.logger.Info("Delete member", "name", deleteHost) 274 // The pods in the cluster are managed by a StatefulSet. If the replica count is decreased, 275 // then the last pod will be removed first. 276 // 277 if strings.HasPrefix(deleteHost, ha.dbManager.GetCurrentMemberName()) { 278 ha.logger.Info(fmt.Sprintf("The last pod %s is the primary member and cannot be deleted. waiting "+ 279 "for The controller to perform a switchover to a new primary member before this pod can be removed. ", deleteHost)) 280 _ = ha.dbManager.Demote(ha.ctx) 281 _ = ha.dcs.ReleaseLease() 282 return 283 } 284 memberName := strings.Split(deleteHost, ".")[0] 285 member := cluster.GetMemberWithName(memberName) 286 if member != nil { 287 ha.logger.Info(fmt.Sprintf("member %s exists, do not delete", memberName)) 288 return 289 } 290 _ = ha.dbManager.LeaveMemberFromCluster(ha.ctx, cluster, memberName) 291 } 292 293 func (ha *Ha) IsHealthiestMember(ctx context.Context, cluster *dcs3.Cluster) bool { 294 currentMemberName := ha.dbManager.GetCurrentMemberName() 295 currentMember := cluster.GetMemberWithName(currentMemberName) 296 if cluster.Switchover != nil { 297 switchover := cluster.Switchover 298 leader := switchover.Leader 299 candidate := switchover.Candidate 300 301 if leader == currentMemberName { 302 ha.logger.Info("manual switchover to other member") 303 return false 304 } 305 306 if candidate == currentMemberName { 307 ha.logger.Info("manual switchover to current member", "member", candidate) 308 return true 309 } 310 311 if candidate != "" { 312 ha.logger.Info("manual switchover to new leader", "new leader", candidate) 313 return false 314 } 315 return ha.isMinimumLag(ctx, cluster, currentMember) 316 } 317 318 if member := ha.dbManager.HasOtherHealthyLeader(ctx, cluster); member != nil { 319 ha.logger.Info("there is a healthy leader exists", "leader", member.Name) 320 return false 321 } 322 323 return ha.isMinimumLag(ctx, cluster, currentMember) 324 } 325 326 func (ha *Ha) HasOtherHealthyMember(cluster *dcs3.Cluster) bool { 327 var otherMembers = make([]*dcs3.Member, 0, 1) 328 if cluster.Switchover != nil && cluster.Switchover.Candidate != "" { 329 candidate := cluster.Switchover.Candidate 330 if candidate != ha.dbManager.GetCurrentMemberName() { 331 otherMembers = append(otherMembers, cluster.GetMemberWithName(candidate)) 332 } 333 } else { 334 for _, member := range cluster.Members { 335 if member.Name == ha.dbManager.GetCurrentMemberName() { 336 continue 337 } 338 otherMembers = append(otherMembers, &member) 339 } 340 } 341 342 for _, other := range otherMembers { 343 if ha.dbManager.IsMemberHealthy(ha.ctx, cluster, other) { 344 if isLagging, _ := ha.dbManager.IsMemberLagging(ha.ctx, cluster, other); !isLagging { 345 return true 346 } 347 } 348 } 349 350 return false 351 } 352 353 func (ha *Ha) DeleteCurrentMember(ctx context.Context, cluster *dcs3.Cluster) error { 354 currentMember := cluster.GetMemberWithName(ha.dbManager.GetCurrentMemberName()) 355 if cluster.HaConfig.IsDeleted(currentMember) { 356 return nil 357 } 358 359 ha.deleteLock.Lock() 360 defer ha.deleteLock.Unlock() 361 362 // if current member is leader, take a switchover first 363 if ha.dcs.HasLease() { 364 for cluster.Switchover != nil { 365 ha.logger.Info("cluster is doing switchover, wait for it to finish") 366 return nil 367 } 368 369 leaderMember := cluster.GetLeaderMember() 370 if len(ha.dbManager.HasOtherHealthyMembers(ctx, cluster, leaderMember.Name)) == 0 { 371 message := "cluster has no other healthy members" 372 ha.logger.Info(message) 373 return errors.New(message) 374 } 375 376 err := ha.dcs.CreateSwitchover(leaderMember.Name, "") 377 if err != nil { 378 ha.logger.Error(err, "switchover failed") 379 return err 380 } 381 382 ha.logger.Info("cluster is doing switchover, wait for it to finish") 383 return nil 384 } 385 386 // redistribute the data of the current member among other members if needed 387 err := ha.dbManager.MoveData(ctx, cluster) 388 if err != nil { 389 ha.logger.Error(err, "Move data failed") 390 return err 391 } 392 393 // remove current member from db cluster 394 err = ha.dbManager.LeaveMemberFromCluster(ctx, cluster, ha.dbManager.GetCurrentMemberName()) 395 if err != nil { 396 ha.logger.Error(err, "Delete member form cluster failed") 397 return err 398 } 399 cluster.HaConfig.FinishDeleted(currentMember) 400 return ha.dcs.UpdateHaConfig() 401 } 402 403 // IsPodReady checks if pod is ready, it can successfully resolve domain 404 func (ha *Ha) IsPodReady() (bool, error) { 405 domain := viper.GetString("KB_POD_FQDN") 406 pinger, err := probing.NewPinger(domain) 407 if err != nil { 408 ha.logger.Error(err, "new pinger failed") 409 return false, err 410 } 411 412 pinger.Count = 3 413 pinger.Timeout = 3 * time.Second 414 pinger.Interval = 500 * time.Millisecond 415 err = pinger.Run() 416 if err != nil { 417 // For container runtimes like Containerd, unprivileged users can't send icmp echo packets. 418 // As a temporary workaround, special handling is being implemented to bypass this limitation. 419 if strings.Contains(err.Error(), "socket: permission denied") { 420 ha.logger.Info("ping failed, socket: permission denied, but temporarily return true") 421 return true, nil 422 } 423 ha.logger.Error(err, fmt.Sprintf("ping domain:%s failed", domain)) 424 return false, err 425 } 426 427 return true, nil 428 } 429 430 func (ha *Ha) isMinimumLag(ctx context.Context, cluster *dcs3.Cluster, member *dcs3.Member) bool { 431 isCurrentLagging, currentLag := ha.dbManager.IsMemberLagging(ctx, cluster, member) 432 if !isCurrentLagging { 433 return true 434 } 435 436 for _, m := range cluster.Members { 437 if m.Name != member.Name { 438 isLagging, lag := ha.dbManager.IsMemberLagging(ctx, cluster, &m) 439 // There are other members with smaller lag 440 if !isLagging || lag < currentLag { 441 return false 442 } 443 } 444 } 445 446 return true 447 } 448 449 func (ha *Ha) ShutdownWithWait() { 450 ha.dbManager.ShutDownWithWait() 451 }