github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/engines/postgres/apecloudpostgres/manager.go (about)

     1  /*
     2  Copyright (C) 2022-2023 ApeCloud Co., Ltd
     3  
     4  This file is part of KubeBlocks project
     5  
     6  This program is free software: you can redistribute it and/or modify
     7  it under the terms of the GNU Affero General Public License as published by
     8  the Free Software Foundation, either version 3 of the License, or
     9  (at your option) any later version.
    10  
    11  This program is distributed in the hope that it will be useful
    12  but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14  GNU Affero General Public License for more details.
    15  
    16  You should have received a copy of the GNU Affero General Public License
    17  along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18  */
    19  
    20  package apecloudpostgres
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"strings"
    26  	"time"
    27  
    28  	"github.com/pkg/errors"
    29  	"github.com/spf13/cast"
    30  	"golang.org/x/exp/slices"
    31  
    32  	"github.com/1aal/kubeblocks/pkg/lorry/dcs"
    33  	"github.com/1aal/kubeblocks/pkg/lorry/engines"
    34  	"github.com/1aal/kubeblocks/pkg/lorry/engines/models"
    35  	"github.com/1aal/kubeblocks/pkg/lorry/engines/postgres"
    36  )
    37  
// Manager extends the generic postgres.Manager with operations specific to
// the ApeCloud PostgreSQL consensus engine (paxos roles, consensus_* views).
type Manager struct {
	postgres.Manager
	// memberAddrs caches cluster member IPs; trusted only while DBState is
	// non-nil (set by GetDBState).
	memberAddrs  []string
	// healthStatus caches the current member's consensus health view;
	// trusted only while DBState is non-nil (set by GetDBState).
	healthStatus *postgres.ConsensusMemberHealthStatus
}
    43  
// Compile-time assertion that *Manager implements engines.DBManager.
var _ engines.DBManager = &Manager{}

// Mgr is the package-level singleton populated by NewManager.
var Mgr *Manager
    47  
    48  func NewManager(properties engines.Properties) (engines.DBManager, error) {
    49  	Mgr = &Manager{}
    50  
    51  	baseManager, err := postgres.NewManager(properties)
    52  	if err != nil {
    53  		return nil, errors.Errorf("new base manager failed, err: %v", err)
    54  	}
    55  
    56  	Mgr.Manager = *baseManager.(*postgres.Manager)
    57  	return Mgr, nil
    58  }
    59  
// GetDBState refreshes the manager's cached view of the database state:
// leadership, member addresses, and the current member's consensus health.
// It returns nil (leaving mgr.DBState cleared) if any probe fails, so
// callers can tell "state unknown" apart from a populated state.
func (mgr *Manager) GetDBState(ctx context.Context, cluster *dcs.Cluster) *dcs.DBState {
	// Invalidate the cache first: the cached fields below (leader flag,
	// memberAddrs, healthStatus) are only trusted while DBState is non-nil.
	mgr.DBState = nil
	mgr.UnsetIsLeader()
	dbState := &dcs.DBState{
		Extra: map[string]string{},
	}

	isLeader, err := mgr.IsLeader(ctx, cluster)
	if err != nil {
		mgr.Logger.Error(err, "check is leader failed")
		return nil
	}
	mgr.SetIsLeader(isLeader)

	// GetMemberAddrs signals failure by returning nil.
	memberAddrs := mgr.GetMemberAddrs(ctx, cluster)
	if memberAddrs == nil {
		mgr.Logger.Error(nil, "get member addrs failed")
		return nil
	}
	mgr.memberAddrs = memberAddrs

	healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, cluster.GetMemberWithName(mgr.CurrentMemberName))
	if err != nil {
		mgr.Logger.Error(err, "get member health status failed")
		return nil
	}
	mgr.healthStatus = healthStatus

	// Setting DBState last marks all cached fields above as valid.
	mgr.DBState = dbState
	return dbState
}
    91  
    92  func (mgr *Manager) IsLeader(ctx context.Context, cluster *dcs.Cluster) (bool, error) {
    93  	isSet, isLeader := mgr.GetIsLeader()
    94  	if isSet {
    95  		return isLeader, nil
    96  	}
    97  
    98  	return mgr.IsLeaderWithHost(ctx, "")
    99  }
   100  
   101  func (mgr *Manager) IsLeaderWithHost(ctx context.Context, host string) (bool, error) {
   102  	role, err := mgr.GetMemberRoleWithHost(ctx, host)
   103  	if err != nil {
   104  		return false, errors.Errorf("check is leader with host:%s failed, err:%v", host, err)
   105  	}
   106  
   107  	return role == models.LEADER, nil
   108  }
   109  
   110  func (mgr *Manager) IsDBStartupReady() bool {
   111  	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
   112  	defer cancel()
   113  	if mgr.DBStartupReady {
   114  		return true
   115  	}
   116  
   117  	if !mgr.IsPgReady(ctx) {
   118  		return false
   119  	}
   120  
   121  	if !mgr.isConsensusReadyUp(ctx) {
   122  		return false
   123  	}
   124  
   125  	mgr.DBStartupReady = true
   126  	mgr.Logger.Info("DB startup ready")
   127  	return true
   128  }
   129  
   130  func (mgr *Manager) isConsensusReadyUp(ctx context.Context) bool {
   131  	sql := `SELECT extname FROM pg_extension WHERE extname = 'consensus_monitor';`
   132  	resp, err := mgr.Query(ctx, sql)
   133  	if err != nil {
   134  		mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql))
   135  		return false
   136  	}
   137  
   138  	resMap, err := postgres.ParseQuery(string(resp))
   139  	if err != nil {
   140  		mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp)))
   141  		return false
   142  	}
   143  
   144  	return resMap[0]["extname"] != nil
   145  }
   146  
   147  func (mgr *Manager) IsClusterInitialized(ctx context.Context, cluster *dcs.Cluster) (bool, error) {
   148  	if !mgr.IsFirstMember() {
   149  		mgr.Logger.Info("I am not the first member, just skip and wait for the first member to initialize the cluster.")
   150  		return true, nil
   151  	}
   152  
   153  	if !mgr.IsDBStartupReady() {
   154  		return false, nil
   155  	}
   156  
   157  	sql := `SELECT usename FROM pg_user WHERE usename = 'replicator';`
   158  	resp, err := mgr.Query(ctx, sql)
   159  	if err != nil {
   160  		mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql))
   161  		return false, err
   162  	}
   163  
   164  	resMap, err := postgres.ParseQuery(string(resp))
   165  	if err != nil {
   166  		mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp)))
   167  		return false, err
   168  	}
   169  
   170  	return resMap[0]["usename"] != nil, nil
   171  }
   172  
   173  func (mgr *Manager) InitializeCluster(ctx context.Context, cluster *dcs.Cluster) error {
   174  	sql := "create role replicator with superuser login password 'replicator';" +
   175  		"create extension if not exists consensus_monitor;"
   176  
   177  	_, err := mgr.Exec(ctx, sql)
   178  	return err
   179  }
   180  
   181  func (mgr *Manager) GetMemberRoleWithHost(ctx context.Context, host string) (string, error) {
   182  	sql := `select paxos_role from consensus_member_status;`
   183  
   184  	resp, err := mgr.QueryWithHost(ctx, sql, host)
   185  	if err != nil {
   186  		mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql))
   187  		return "", err
   188  	}
   189  
   190  	resMap, err := postgres.ParseQuery(string(resp))
   191  	if err != nil {
   192  		mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp)))
   193  		return "", err
   194  	}
   195  
   196  	// TODO:paxos roles are currently represented by numbers, will change to string in the future
   197  	var role string
   198  	switch cast.ToInt(resMap[0]["paxos_role"]) {
   199  	case 0:
   200  		role = models.FOLLOWER
   201  	case 1:
   202  		role = models.CANDIDATE
   203  	case 2:
   204  		role = models.LEADER
   205  	case 3:
   206  		role = models.LEARNER
   207  	default:
   208  		mgr.Logger.Info(fmt.Sprintf("get invalid role number:%d", cast.ToInt(resMap[0]["paxos_role"])))
   209  		role = ""
   210  	}
   211  
   212  	return role, nil
   213  }
   214  
   215  func (mgr *Manager) GetMemberAddrs(ctx context.Context, cluster *dcs.Cluster) []string {
   216  	if mgr.DBState != nil && mgr.memberAddrs != nil {
   217  		return mgr.memberAddrs
   218  	}
   219  
   220  	sql := `select ip_port from consensus_cluster_status;`
   221  	resp, err := mgr.QueryLeader(ctx, sql, cluster)
   222  	if err != nil {
   223  		mgr.Logger.Error(err, fmt.Sprintf("query %s with leader failed", sql))
   224  		return nil
   225  	}
   226  
   227  	result, err := postgres.ParseQuery(string(resp))
   228  	if err != nil {
   229  		mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp)))
   230  		return nil
   231  	}
   232  
   233  	var addrs []string
   234  	for _, m := range result {
   235  		addrs = append(addrs, strings.Split(cast.ToString(m["ip_port"]), ":")[0])
   236  	}
   237  
   238  	return addrs
   239  }
   240  
   241  func (mgr *Manager) IsCurrentMemberInCluster(ctx context.Context, cluster *dcs.Cluster) bool {
   242  	memberAddrs := mgr.GetMemberAddrs(ctx, cluster)
   243  	// AddCurrentMemberToCluster is executed only when memberAddrs are successfully obtained and memberAddrs not Contains CurrentMember
   244  	if memberAddrs != nil && !slices.Contains(memberAddrs, cluster.GetMemberAddrWithName(mgr.CurrentMemberName)) {
   245  		return false
   246  	}
   247  	return true
   248  }
   249  
   250  func (mgr *Manager) IsCurrentMemberHealthy(ctx context.Context, cluster *dcs.Cluster) bool {
   251  	return mgr.IsMemberHealthy(ctx, cluster, cluster.GetMemberWithName(mgr.CurrentMemberName))
   252  }
   253  
   254  // IsMemberHealthy firstly get the leader's connection pool,
   255  // because only leader can get the cluster healthy view
   256  func (mgr *Manager) IsMemberHealthy(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) bool {
   257  	healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, member)
   258  	if errors.Is(err, postgres.ClusterHasNoLeader) {
   259  		mgr.Logger.Info("cluster has no leader, will compete the leader lock")
   260  		return true
   261  	} else if err != nil {
   262  		mgr.Logger.Error(err, "check member healthy failed")
   263  		return false
   264  	}
   265  
   266  	return healthStatus.Connected
   267  }
   268  
   269  func (mgr *Manager) getMemberHealthStatus(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (*postgres.ConsensusMemberHealthStatus, error) {
   270  	if mgr.DBState != nil && mgr.healthStatus != nil {
   271  		return mgr.healthStatus, nil
   272  	}
   273  	res := &postgres.ConsensusMemberHealthStatus{}
   274  
   275  	IPPort := mgr.Config.GetConsensusIPPort(cluster, member.Name)
   276  	sql := fmt.Sprintf(`select connected, log_delay_num from consensus_cluster_health where ip_port = '%s';`, IPPort)
   277  	resp, err := mgr.QueryLeader(ctx, sql, cluster)
   278  	if err != nil {
   279  		return nil, err
   280  	}
   281  
   282  	resMap, err := postgres.ParseQuery(string(resp))
   283  	if err != nil {
   284  		return nil, err
   285  	}
   286  
   287  	if resMap[0]["connected"] != nil {
   288  		res.Connected = cast.ToBool(resMap[0]["connected"])
   289  	}
   290  	if resMap[0]["log_delay_num"] != nil {
   291  		res.LogDelayNum = cast.ToInt64(resMap[0]["log_delay_num"])
   292  	}
   293  
   294  	return res, nil
   295  }
   296  
   297  func (mgr *Manager) IsMemberLagging(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (bool, int64) {
   298  	healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, member)
   299  	if errors.Is(err, postgres.ClusterHasNoLeader) {
   300  		mgr.Logger.Info("cluster has no leader, so member has no lag")
   301  		return false, 0
   302  	} else if err != nil {
   303  		mgr.Logger.Error(err, "check member lag failed")
   304  		return true, cluster.HaConfig.GetMaxLagOnSwitchover() + 1
   305  	}
   306  
   307  	return healthStatus.LogDelayNum > cluster.HaConfig.GetMaxLagOnSwitchover(), healthStatus.LogDelayNum
   308  }
   309  
   310  func (mgr *Manager) JoinCurrentMemberToCluster(ctx context.Context, cluster *dcs.Cluster) error {
   311  	sql := fmt.Sprintf(`alter system consensus add follower '%s:%d';`,
   312  		cluster.GetMemberAddrWithName(mgr.CurrentMemberName), mgr.Config.GetDBPort())
   313  
   314  	_, err := mgr.ExecLeader(ctx, sql, cluster)
   315  	if err != nil {
   316  		mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql))
   317  		return err
   318  	}
   319  
   320  	return nil
   321  }
   322  
   323  func (mgr *Manager) LeaveMemberFromCluster(ctx context.Context, cluster *dcs.Cluster, host string) error {
   324  	sql := fmt.Sprintf(`alter system consensus drop follower '%s:%d';`,
   325  		host, mgr.Config.GetDBPort())
   326  
   327  	// only leader can delete member, so don't need to get pool
   328  	_, err := mgr.ExecWithHost(ctx, sql, "")
   329  	if err != nil {
   330  		mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql))
   331  		return err
   332  	}
   333  
   334  	return nil
   335  }
   336  
   337  // IsClusterHealthy considers the health status of the cluster equivalent to the health status of the leader
   338  func (mgr *Manager) IsClusterHealthy(ctx context.Context, cluster *dcs.Cluster) bool {
   339  	leaderMember := cluster.GetLeaderMember()
   340  	if leaderMember == nil {
   341  		mgr.Logger.Info("cluster has no leader, wait for leader to take the lock")
   342  		// when cluster has no leader, the health status of the cluster is assumed to be true by default,
   343  		// in order to proceed with the logic of competing for the leader lock
   344  		return true
   345  	}
   346  
   347  	if leaderMember.Name == mgr.CurrentMemberName {
   348  		// if the member is leader, then its health status will check in IsMemberHealthy later
   349  		return true
   350  	}
   351  
   352  	return mgr.IsMemberHealthy(ctx, cluster, leaderMember)
   353  }
   354  
   355  func (mgr *Manager) Promote(ctx context.Context, cluster *dcs.Cluster) error {
   356  	if isLeader, err := mgr.IsLeader(ctx, nil); isLeader && err == nil {
   357  		mgr.Logger.Info("i am already the leader, don't need to promote")
   358  		return nil
   359  	}
   360  
   361  	// TODO:will get leader ip_port from consensus_member_status directly in the future
   362  	sql := `select ip_port from consensus_cluster_status where server_id = (select current_leader from consensus_member_status);`
   363  	resp, err := mgr.Query(ctx, sql)
   364  	if err != nil {
   365  		mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql))
   366  		return err
   367  	}
   368  
   369  	resMap, err := postgres.ParseQuery(string(resp))
   370  	if err != nil {
   371  		return errors.Errorf("parse query response:%s failed, err:%v", string(resp), err)
   372  	}
   373  
   374  	currentLeaderAddr := strings.Split(cast.ToString(resMap[0]["ip_port"]), ":")[0]
   375  	promoteSQL := fmt.Sprintf(`alter system consensus CHANGE LEADER TO '%s:%d';`, cluster.GetMemberAddrWithName(mgr.CurrentMemberName), mgr.Config.GetDBPort())
   376  	_, err = mgr.ExecWithHost(ctx, promoteSQL, currentLeaderAddr)
   377  	if err != nil {
   378  		mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql))
   379  		return err
   380  	}
   381  
   382  	return nil
   383  }
   384  
   385  func (mgr *Manager) IsPromoted(ctx context.Context) bool {
   386  	isLeader, _ := mgr.IsLeader(ctx, nil)
   387  	return isLeader
   388  }
   389  
   390  func (mgr *Manager) HasOtherHealthyLeader(ctx context.Context, cluster *dcs.Cluster) *dcs.Member {
   391  	if isLeader, err := mgr.IsLeader(ctx, cluster); isLeader && err == nil {
   392  		// I am the leader, just return nil
   393  		return nil
   394  	}
   395  
   396  	// TODO:will get leader ip_port from consensus_member_status directly in the future
   397  	sql := `select ip_port from consensus_cluster_status where server_id = (select current_leader from consensus_member_status);`
   398  	resp, err := mgr.Query(ctx, sql)
   399  	if err != nil {
   400  		mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql))
   401  		return nil
   402  	}
   403  
   404  	resMap, err := postgres.ParseQuery(string(resp))
   405  	if err != nil {
   406  		mgr.Logger.Error(err, "parse query response failed")
   407  		return nil
   408  	}
   409  
   410  	host := strings.Split(cast.ToString(resMap[0]["ip_port"]), ":")[0]
   411  	leaderName := strings.Split(host, ".")[0]
   412  	if len(leaderName) > 0 {
   413  		return cluster.GetMemberWithName(leaderName)
   414  	}
   415  
   416  	return nil
   417  }
   418  
   419  func (mgr *Manager) HasOtherHealthyMembers(ctx context.Context, cluster *dcs.Cluster, leader string) []*dcs.Member {
   420  	members := make([]*dcs.Member, 0)
   421  
   422  	for i, m := range cluster.Members {
   423  		if m.Name != leader && mgr.IsMemberHealthy(ctx, cluster, &m) {
   424  			members = append(members, &cluster.Members[i])
   425  		}
   426  	}
   427  
   428  	return members
   429  }