github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/highavailability/ha.go (about)

     1  /*
     2  Copyright (C) 2022-2023 ApeCloud Co., Ltd
     3  
     4  This file is part of KubeBlocks project
     5  
     6  This program is free software: you can redistribute it and/or modify
     7  it under the terms of the GNU Affero General Public License as published by
     8  the Free Software Foundation, either version 3 of the License, or
     9  (at your option) any later version.
    10  
    11  This program is distributed in the hope that it will be useful
    12  but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14  GNU Affero General Public License for more details.
    15  
    16  You should have received a copy of the GNU Affero General Public License
    17  along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18  */
    19  
    20  package highavailability
    21  
    22  import (
    23  	"context"
    24  	"errors"
    25  	"fmt"
    26  	"sort"
    27  	"strings"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/go-logr/logr"
    32  	probing "github.com/prometheus-community/pro-bing"
    33  	ctrl "sigs.k8s.io/controller-runtime"
    34  
    35  	dcs3 "github.com/1aal/kubeblocks/pkg/lorry/dcs"
    36  	"github.com/1aal/kubeblocks/pkg/lorry/engines"
    37  	"github.com/1aal/kubeblocks/pkg/lorry/engines/register"
    38  	viper "github.com/1aal/kubeblocks/pkg/viperx"
    39  )
    40  
// Ha drives the high-availability reconciliation for the local database
// member: it watches the cluster state stored in the DCS and promotes,
// demotes, joins, starts, or removes the member as needed.
type Ha struct {
	ctx        context.Context   // base context for all DB-manager and DCS calls
	dbManager  engines.DBManager // engine-specific database operations
	dcs        dcs3.DCS          // distributed configuration store (cluster state, leases)
	logger     logr.Logger
	deleteLock sync.Mutex // serializes DeleteCurrentMember invocations
}
    48  
    49  var ha *Ha
    50  
    51  func NewHa() *Ha {
    52  	logger := ctrl.Log.WithName("HA")
    53  
    54  	dcs := dcs3.GetStore()
    55  	manager, err := register.GetDBManager()
    56  	if err != nil {
    57  		logger.Error(err, "No DB Manager")
    58  		return nil
    59  	}
    60  
    61  	ha = &Ha{
    62  		ctx:       context.Background(),
    63  		dcs:       dcs,
    64  		logger:    logger,
    65  		dbManager: manager,
    66  	}
    67  	return ha
    68  }
    69  
// GetHa returns the singleton created by NewHa; nil if NewHa has not been
// called or failed.
func GetHa() *Ha {
	return ha
}
    73  
// RunCycle performs one reconciliation pass: it refreshes the cluster view
// from the DCS, handles deletion/restart of the local member, and then takes
// exactly one corrective action chosen by the switch below (ordered from the
// most to the least severe condition).
func (ha *Ha) RunCycle() {
	cluster, err := ha.dcs.GetCluster()
	if err != nil {
		ha.logger.Error(err, "Get Cluster err")
		return
	}

	// HA can be toggled per cluster through its HA config.
	if !cluster.HaConfig.IsEnable() {
		return
	}

	currentMember := cluster.GetMemberWithName(ha.dbManager.GetCurrentMemberName())

	// A member marked for deletion is removed rather than reconciled.
	if cluster.HaConfig.IsDeleting(currentMember) {
		ha.logger.Info("Current Member is deleted!")
		_ = ha.DeleteCurrentMember(ha.ctx, cluster)
		return
	}

	// DB process is down: give up any lease we hold and try to start it.
	if !ha.dbManager.IsRunning() {
		ha.logger.Info("DB Service is not running,  wait for lorryctl to start it")
		if ha.dcs.HasLease() {
			_ = ha.dcs.ReleaseLease()
		}
		_ = ha.dbManager.Start(ha.ctx, cluster)
		return
	}

	DBState := ha.dbManager.GetDBState(ha.ctx, cluster)
	// store leader's db state in dcs
	if cluster.Leader != nil && cluster.Leader.Name == ha.dbManager.GetCurrentMemberName() {
		cluster.Leader.DBState = DBState
	}

	switch {
	// IsClusterHealthy is just for consensus cluster healthy check.
	// For Replication cluster IsClusterHealthy will always return true,
	// and its cluster's healthy is equal to leader member's healthy.
	case !ha.dbManager.IsClusterHealthy(ha.ctx, cluster):
		ha.logger.Error(nil, "The cluster is not healthy, wait...")

	// Member missing from the db cluster and there is still room below the
	// desired replica count: rejoin it.
	case !ha.dbManager.IsCurrentMemberInCluster(ha.ctx, cluster) && int(cluster.Replicas) > len(ha.dbManager.GetMemberAddrs(ha.ctx, cluster)):
		ha.logger.Info("Current member is not in cluster, add it to cluster")
		_ = ha.dbManager.JoinCurrentMemberToCluster(ha.ctx, cluster)

	// Unhealthy member must not keep the lease.
	case !ha.dbManager.IsCurrentMemberHealthy(ha.ctx, cluster):
		ha.logger.Info("DB Service is not healthy,  do some recover")
		if ha.dcs.HasLease() {
			_ = ha.dcs.ReleaseLease()
		}
	//	dbManager.Recover()

	// No leader lease exists: try to become the leader if we are the best
	// candidate (healthiest / smallest replication lag).
	case !cluster.IsLocked():
		ha.logger.Info("Cluster has no leader, attempt to take the leader")
		if ha.IsHealthiestMember(ha.ctx, cluster) {
			// NOTE(review): cluster.Leader may be nil when the cluster is not
			// locked — this assignment would panic then; confirm GetCluster
			// always populates Leader.
			cluster.Leader.DBState = DBState
			if ha.dcs.AttempAcquireLease() == nil {
				err := ha.dbManager.Promote(ha.ctx, cluster)
				if err != nil {
					ha.logger.Error(err, "Take the leader failed")
					_ = ha.dcs.ReleaseLease()
				} else {
					ha.logger.Info("Take the leader success!")
				}
			}
		}

	// We hold the lease: honor any pending switchover, keep the promotion in
	// effect, refresh the lease TTL, and shrink the cluster if over-replicated.
	case ha.dcs.HasLease():
		ha.logger.Info("This member is Cluster's leader")
		if cluster.Switchover != nil {
			// Switchover away from this member: demote and release the lease,
			// but only when another healthy, non-lagging member can take over.
			if cluster.Switchover.Leader == ha.dbManager.GetCurrentMemberName() ||
				(cluster.Switchover.Candidate != "" && cluster.Switchover.Candidate != ha.dbManager.GetCurrentMemberName()) {
				if ha.HasOtherHealthyMember(cluster) {
					_ = ha.dbManager.Demote(ha.ctx)
					_ = ha.dcs.ReleaseLease()
					break
				}

			} else if cluster.Switchover.Candidate == "" || cluster.Switchover.Candidate == ha.dbManager.GetCurrentMemberName() {
				// Switchover toward this member: clear it once promotion completes.
				if !ha.dbManager.IsPromoted(ha.ctx) {
					// wait and retry
					break
				}
				_ = ha.dcs.DeleteSwitchover()
			}
		}

		if ha.dbManager.HasOtherHealthyLeader(ha.ctx, cluster) != nil {
			// this case is applicable only to consensus cluster, where the db's internal
			// role services as the source of truth.
			// for replicationset cluster,  HasOtherHealthyLeader will always be false.
			ha.logger.Info("Release leader")
			_ = ha.dcs.ReleaseLease()
			break
		}
		err := ha.dbManager.Promote(ha.ctx, cluster)
		if err != nil {
			ha.logger.Error(err, "promote failed")
			break
		}

		ha.logger.Info("Refresh leader ttl")
		_ = ha.dcs.UpdateLease()

		// More members than desired replicas: remove the highest-ordinal one.
		if int(cluster.Replicas) < len(ha.dbManager.GetMemberAddrs(ha.ctx, cluster)) && cluster.Replicas != 0 {
			ha.DecreaseClusterReplicas(cluster)
		}

	// Someone else holds the lease: follow the lease owner.
	case !ha.dcs.HasLease():
		if cluster.Switchover != nil {
			break
		}
		// TODO: In the event that the database service and SQL channel both go down concurrently, eg. Pod deleted,
		// there is no healthy leader node and the lock remains unreleased, attempt to acquire the leader lock.

		// leaderMember := cluster.GetLeaderMember()
		// lockOwnerIsLeader, _ := ha.dbManager.IsLeaderMember(ha.ctx, cluster, leaderMember)
		// currentMemberIsLeader, _ := ha.dbManager.IsLeader(context.TODO(), cluster)
		// if lockOwnerIsLeader && currentMemberIsLeader {
		// ha.logger.Infof("Lock owner is real Leader, demote myself and follow the real leader")
		_ = ha.dbManager.Demote(ha.ctx)
		_ = ha.dbManager.Follow(ha.ctx, cluster)
	}
}
   198  
   199  func (ha *Ha) Start() {
   200  	ha.logger.Info("HA starting")
   201  	cluster, err := ha.dcs.GetCluster()
   202  	if cluster == nil {
   203  		ha.logger.Error(err, "Get Cluster error, so HA exists.", "cluster-name", ha.dcs.GetClusterName())
   204  		return
   205  	}
   206  
   207  	// isPodReady, err := ha.IsPodReady()
   208  	// for err != nil || !isPodReady {
   209  	//	ha.logger.Info("Waiting for dns resolution to be ready")
   210  	//	time.Sleep(3 * time.Second)
   211  	//	isPodReady, err = ha.IsPodReady()
   212  	// }
   213  	ha.logger.Info("dns resolution is ready")
   214  
   215  	ha.logger.Info(fmt.Sprintf("cluster: %v", cluster))
   216  	isInitialized, err := ha.dbManager.IsClusterInitialized(context.TODO(), cluster)
   217  	for err != nil || !isInitialized {
   218  		ha.logger.Info("Waiting for the database cluster to be initialized.")
   219  		// TODO: implement dbmanager initialize to replace pod's entrypoint scripts
   220  		// if I am the node of index 0, then do initialization
   221  		if err == nil && !isInitialized && ha.dbManager.IsFirstMember() {
   222  			ha.logger.Info("Initialize cluster.")
   223  			err := ha.dbManager.InitializeCluster(ha.ctx, cluster)
   224  			if err != nil {
   225  				ha.logger.Error(err, "Cluster initialize failed")
   226  			}
   227  		}
   228  		time.Sleep(5 * time.Second)
   229  		isInitialized, err = ha.dbManager.IsClusterInitialized(context.TODO(), cluster)
   230  	}
   231  	ha.logger.Info("The database cluster is initialized.")
   232  
   233  	isRootCreated, err := ha.dbManager.IsRootCreated(ha.ctx)
   234  	for err != nil || !isRootCreated {
   235  		if err == nil && !isRootCreated && ha.dbManager.IsFirstMember() {
   236  			ha.logger.Info("Create Root.")
   237  			err := ha.dbManager.CreateRoot(ha.ctx)
   238  			if err != nil {
   239  				ha.logger.Error(err, "Cluster initialize failed")
   240  			}
   241  		}
   242  		time.Sleep(5 * time.Second)
   243  		isRootCreated, err = ha.dbManager.IsRootCreated(ha.ctx)
   244  	}
   245  
   246  	isExist, _ := ha.dcs.IsLeaseExist()
   247  	for !isExist {
   248  		if ok, _ := ha.dbManager.IsLeader(context.Background(), cluster); ok {
   249  			err := ha.dcs.Initialize(cluster)
   250  			if err != nil {
   251  				ha.logger.Error(err, "DCS initialize failed")
   252  				time.Sleep(5 * time.Second)
   253  				continue
   254  			}
   255  			break
   256  		}
   257  
   258  		ha.logger.Info("Waiting for the database Leader to be ready.")
   259  		time.Sleep(5 * time.Second)
   260  		isExist, _ = ha.dcs.IsLeaseExist()
   261  	}
   262  
   263  	for {
   264  		ha.RunCycle()
   265  		time.Sleep(10 * time.Second)
   266  	}
   267  }
   268  
// DecreaseClusterReplicas shrinks the db cluster membership when it has more
// members than the desired replica count: the member with the
// lexicographically largest address (the highest-ordinal StatefulSet pod) is
// removed. Assumes the address list is non-empty (guaranteed by the caller's
// replica-count check in RunCycle).
func (ha *Ha) DecreaseClusterReplicas(cluster *dcs3.Cluster) {
	hosts := ha.dbManager.GetMemberAddrs(ha.ctx, cluster)
	sort.Strings(hosts)
	deleteHost := hosts[len(hosts)-1]
	ha.logger.Info("Delete member", "name", deleteHost)
	// The pods in the cluster are managed by a StatefulSet. If the replica count is decreased,
	// then the last pod will be removed first.
	//
	if strings.HasPrefix(deleteHost, ha.dbManager.GetCurrentMemberName()) {
		// The pod to remove is this (primary) member: step down and release the
		// lease so the controller can switch over before the pod is deleted.
		ha.logger.Info(fmt.Sprintf("The last pod %s is the primary member and cannot be deleted. waiting "+
			"for The controller to perform a switchover to a new primary member before this pod can be removed. ", deleteHost))
		_ = ha.dbManager.Demote(ha.ctx)
		_ = ha.dcs.ReleaseLease()
		return
	}
	// The member name is the host portion of the FQDN (pod name).
	memberName := strings.Split(deleteHost, ".")[0]
	// Only remove members that are already gone from the cluster spec; a
	// member still present there is managed by the controller.
	member := cluster.GetMemberWithName(memberName)
	if member != nil {
		ha.logger.Info(fmt.Sprintf("member %s exists, do not delete", memberName))
		return
	}
	_ = ha.dbManager.LeaveMemberFromCluster(ha.ctx, cluster, memberName)
}
   292  
   293  func (ha *Ha) IsHealthiestMember(ctx context.Context, cluster *dcs3.Cluster) bool {
   294  	currentMemberName := ha.dbManager.GetCurrentMemberName()
   295  	currentMember := cluster.GetMemberWithName(currentMemberName)
   296  	if cluster.Switchover != nil {
   297  		switchover := cluster.Switchover
   298  		leader := switchover.Leader
   299  		candidate := switchover.Candidate
   300  
   301  		if leader == currentMemberName {
   302  			ha.logger.Info("manual switchover to other member")
   303  			return false
   304  		}
   305  
   306  		if candidate == currentMemberName {
   307  			ha.logger.Info("manual switchover to current member", "member", candidate)
   308  			return true
   309  		}
   310  
   311  		if candidate != "" {
   312  			ha.logger.Info("manual switchover to new leader", "new leader", candidate)
   313  			return false
   314  		}
   315  		return ha.isMinimumLag(ctx, cluster, currentMember)
   316  	}
   317  
   318  	if member := ha.dbManager.HasOtherHealthyLeader(ctx, cluster); member != nil {
   319  		ha.logger.Info("there is a healthy leader exists", "leader", member.Name)
   320  		return false
   321  	}
   322  
   323  	return ha.isMinimumLag(ctx, cluster, currentMember)
   324  }
   325  
   326  func (ha *Ha) HasOtherHealthyMember(cluster *dcs3.Cluster) bool {
   327  	var otherMembers = make([]*dcs3.Member, 0, 1)
   328  	if cluster.Switchover != nil && cluster.Switchover.Candidate != "" {
   329  		candidate := cluster.Switchover.Candidate
   330  		if candidate != ha.dbManager.GetCurrentMemberName() {
   331  			otherMembers = append(otherMembers, cluster.GetMemberWithName(candidate))
   332  		}
   333  	} else {
   334  		for _, member := range cluster.Members {
   335  			if member.Name == ha.dbManager.GetCurrentMemberName() {
   336  				continue
   337  			}
   338  			otherMembers = append(otherMembers, &member)
   339  		}
   340  	}
   341  
   342  	for _, other := range otherMembers {
   343  		if ha.dbManager.IsMemberHealthy(ha.ctx, cluster, other) {
   344  			if isLagging, _ := ha.dbManager.IsMemberLagging(ha.ctx, cluster, other); !isLagging {
   345  				return true
   346  			}
   347  		}
   348  	}
   349  
   350  	return false
   351  }
   352  
   353  func (ha *Ha) DeleteCurrentMember(ctx context.Context, cluster *dcs3.Cluster) error {
   354  	currentMember := cluster.GetMemberWithName(ha.dbManager.GetCurrentMemberName())
   355  	if cluster.HaConfig.IsDeleted(currentMember) {
   356  		return nil
   357  	}
   358  
   359  	ha.deleteLock.Lock()
   360  	defer ha.deleteLock.Unlock()
   361  
   362  	// if current member is leader, take a switchover first
   363  	if ha.dcs.HasLease() {
   364  		for cluster.Switchover != nil {
   365  			ha.logger.Info("cluster is doing switchover, wait for it to finish")
   366  			return nil
   367  		}
   368  
   369  		leaderMember := cluster.GetLeaderMember()
   370  		if len(ha.dbManager.HasOtherHealthyMembers(ctx, cluster, leaderMember.Name)) == 0 {
   371  			message := "cluster has no other healthy members"
   372  			ha.logger.Info(message)
   373  			return errors.New(message)
   374  		}
   375  
   376  		err := ha.dcs.CreateSwitchover(leaderMember.Name, "")
   377  		if err != nil {
   378  			ha.logger.Error(err, "switchover failed")
   379  			return err
   380  		}
   381  
   382  		ha.logger.Info("cluster is doing switchover, wait for it to finish")
   383  		return nil
   384  	}
   385  
   386  	// redistribute the data of the current member among other members if needed
   387  	err := ha.dbManager.MoveData(ctx, cluster)
   388  	if err != nil {
   389  		ha.logger.Error(err, "Move data failed")
   390  		return err
   391  	}
   392  
   393  	// remove current member from db cluster
   394  	err = ha.dbManager.LeaveMemberFromCluster(ctx, cluster, ha.dbManager.GetCurrentMemberName())
   395  	if err != nil {
   396  		ha.logger.Error(err, "Delete member form cluster failed")
   397  		return err
   398  	}
   399  	cluster.HaConfig.FinishDeleted(currentMember)
   400  	return ha.dcs.UpdateHaConfig()
   401  }
   402  
   403  // IsPodReady checks if pod is ready, it can successfully resolve domain
   404  func (ha *Ha) IsPodReady() (bool, error) {
   405  	domain := viper.GetString("KB_POD_FQDN")
   406  	pinger, err := probing.NewPinger(domain)
   407  	if err != nil {
   408  		ha.logger.Error(err, "new pinger failed")
   409  		return false, err
   410  	}
   411  
   412  	pinger.Count = 3
   413  	pinger.Timeout = 3 * time.Second
   414  	pinger.Interval = 500 * time.Millisecond
   415  	err = pinger.Run()
   416  	if err != nil {
   417  		// For container runtimes like Containerd, unprivileged users can't send icmp echo packets.
   418  		// As a temporary workaround, special handling is being implemented to bypass this limitation.
   419  		if strings.Contains(err.Error(), "socket: permission denied") {
   420  			ha.logger.Info("ping failed, socket: permission denied, but temporarily return true")
   421  			return true, nil
   422  		}
   423  		ha.logger.Error(err, fmt.Sprintf("ping domain:%s failed", domain))
   424  		return false, err
   425  	}
   426  
   427  	return true, nil
   428  }
   429  
   430  func (ha *Ha) isMinimumLag(ctx context.Context, cluster *dcs3.Cluster, member *dcs3.Member) bool {
   431  	isCurrentLagging, currentLag := ha.dbManager.IsMemberLagging(ctx, cluster, member)
   432  	if !isCurrentLagging {
   433  		return true
   434  	}
   435  
   436  	for _, m := range cluster.Members {
   437  		if m.Name != member.Name {
   438  			isLagging, lag := ha.dbManager.IsMemberLagging(ctx, cluster, &m)
   439  			// There are other members with smaller lag
   440  			if !isLagging || lag < currentLag {
   441  				return false
   442  			}
   443  		}
   444  	}
   445  
   446  	return true
   447  }
   448  
// ShutdownWithWait stops the database service and blocks until the shutdown
// completes, delegating to the engine-specific DB manager.
func (ha *Ha) ShutdownWithWait() {
	ha.dbManager.ShutDownWithWait()
}