github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/engines/postgres/apecloudpostgres/manager.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package apecloudpostgres 21 22 import ( 23 "context" 24 "fmt" 25 "strings" 26 "time" 27 28 "github.com/pkg/errors" 29 "github.com/spf13/cast" 30 "golang.org/x/exp/slices" 31 32 "github.com/1aal/kubeblocks/pkg/lorry/dcs" 33 "github.com/1aal/kubeblocks/pkg/lorry/engines" 34 "github.com/1aal/kubeblocks/pkg/lorry/engines/models" 35 "github.com/1aal/kubeblocks/pkg/lorry/engines/postgres" 36 ) 37 38 type Manager struct { 39 postgres.Manager 40 memberAddrs []string 41 healthStatus *postgres.ConsensusMemberHealthStatus 42 } 43 44 var _ engines.DBManager = &Manager{} 45 46 var Mgr *Manager 47 48 func NewManager(properties engines.Properties) (engines.DBManager, error) { 49 Mgr = &Manager{} 50 51 baseManager, err := postgres.NewManager(properties) 52 if err != nil { 53 return nil, errors.Errorf("new base manager failed, err: %v", err) 54 } 55 56 Mgr.Manager = *baseManager.(*postgres.Manager) 57 return Mgr, nil 58 } 59 60 func (mgr *Manager) GetDBState(ctx context.Context, cluster *dcs.Cluster) *dcs.DBState { 61 mgr.DBState = nil 62 mgr.UnsetIsLeader() 63 dbState := &dcs.DBState{ 64 Extra: map[string]string{}, 65 } 66 67 isLeader, err := mgr.IsLeader(ctx, cluster) 68 if err != nil { 69 mgr.Logger.Error(err, "check is leader failed") 70 return nil 71 } 72 mgr.SetIsLeader(isLeader) 73 74 memberAddrs := mgr.GetMemberAddrs(ctx, cluster) 75 if memberAddrs == nil { 76 mgr.Logger.Error(nil, "get member addrs failed") 77 return nil 78 } 79 mgr.memberAddrs = memberAddrs 80 81 healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, cluster.GetMemberWithName(mgr.CurrentMemberName)) 82 if err != nil { 83 mgr.Logger.Error(err, "get member health status failed") 84 return nil 85 } 86 mgr.healthStatus = healthStatus 87 88 mgr.DBState = dbState 89 return dbState 90 } 91 92 func (mgr *Manager) IsLeader(ctx context.Context, cluster *dcs.Cluster) (bool, error) { 93 isSet, isLeader := mgr.GetIsLeader() 94 if isSet { 95 return isLeader, nil 96 } 97 98 return mgr.IsLeaderWithHost(ctx, "") 99 } 100 101 func (mgr *Manager) IsLeaderWithHost(ctx context.Context, host string) (bool, error) { 102 role, err := mgr.GetMemberRoleWithHost(ctx, host) 103 if err != nil { 104 return false, errors.Errorf("check is leader with host:%s failed, err:%v", host, err) 105 } 106 107 return role == models.LEADER, nil 108 } 109 110 func (mgr *Manager) IsDBStartupReady() bool { 111 ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) 112 defer cancel() 113 if mgr.DBStartupReady { 114 return true 115 } 116 117 if !mgr.IsPgReady(ctx) { 118 return false 119 } 120 121 if !mgr.isConsensusReadyUp(ctx) { 122 return false 123 } 124 125 mgr.DBStartupReady = true 126 mgr.Logger.Info("DB startup ready") 127 return true 128 } 129 130 func (mgr *Manager) isConsensusReadyUp(ctx context.Context) bool { 131 sql := `SELECT extname FROM pg_extension WHERE extname = 'consensus_monitor';` 132 resp, err := mgr.Query(ctx, sql) 133 if err != nil { 134 mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql)) 135 return false 136 } 137 138 resMap, err := postgres.ParseQuery(string(resp)) 139 if err != nil { 140 mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp))) 141 return false 142 } 143 144 return resMap[0]["extname"] != nil 145 } 146 147 func (mgr *Manager) IsClusterInitialized(ctx context.Context, cluster *dcs.Cluster) (bool, error) { 148 if !mgr.IsFirstMember() { 149 mgr.Logger.Info("I am not the first member, just skip and wait for the first member to initialize the cluster.") 150 return true, nil 151 } 152 153 if !mgr.IsDBStartupReady() { 154 return false, nil 155 } 156 157 sql := `SELECT usename FROM pg_user WHERE usename = 'replicator';` 158 resp, err := mgr.Query(ctx, sql) 159 if err != nil { 160 mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql)) 161 return false, err 162 } 163 164 resMap, err := postgres.ParseQuery(string(resp)) 165 if err != nil { 166 mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp))) 167 return false, err 168 } 169 170 return resMap[0]["usename"] != nil, nil 171 } 172 173 func (mgr *Manager) InitializeCluster(ctx context.Context, cluster *dcs.Cluster) error { 174 sql := "create role replicator with superuser login password 'replicator';" + 175 "create extension if not exists consensus_monitor;" 176 177 _, err := mgr.Exec(ctx, sql) 178 return err 179 } 180 181 func (mgr *Manager) GetMemberRoleWithHost(ctx context.Context, host string) (string, error) { 182 sql := `select paxos_role from consensus_member_status;` 183 184 resp, err := mgr.QueryWithHost(ctx, sql, host) 185 if err != nil { 186 mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql)) 187 return "", err 188 } 189 190 resMap, err := postgres.ParseQuery(string(resp)) 191 if err != nil { 192 mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp))) 193 return "", err 194 } 195 196 // TODO:paxos roles are currently represented by numbers, will change to string in the future 197 var role string 198 switch cast.ToInt(resMap[0]["paxos_role"]) { 199 case 0: 200 role = models.FOLLOWER 201 case 1: 202 role = models.CANDIDATE 203 case 2: 204 role = models.LEADER 205 case 3: 206 role = models.LEARNER 207 default: 208 mgr.Logger.Info(fmt.Sprintf("get invalid role number:%d", cast.ToInt(resMap[0]["paxos_role"]))) 209 role = "" 210 } 211 212 return role, nil 213 } 214 215 func (mgr *Manager) GetMemberAddrs(ctx context.Context, cluster *dcs.Cluster) []string { 216 if mgr.DBState != nil && mgr.memberAddrs != nil { 217 return mgr.memberAddrs 218 } 219 220 sql := `select ip_port from consensus_cluster_status;` 221 resp, err := mgr.QueryLeader(ctx, sql, cluster) 222 if err != nil { 223 mgr.Logger.Error(err, fmt.Sprintf("query %s with leader failed", sql)) 224 return nil 225 } 226 227 result, err := postgres.ParseQuery(string(resp)) 228 if err != nil { 229 mgr.Logger.Error(err, fmt.Sprintf("parse query response:%s failed", string(resp))) 230 return nil 231 } 232 233 var addrs []string 234 for _, m := range result { 235 addrs = append(addrs, strings.Split(cast.ToString(m["ip_port"]), ":")[0]) 236 } 237 238 return addrs 239 } 240 241 func (mgr *Manager) IsCurrentMemberInCluster(ctx context.Context, cluster *dcs.Cluster) bool { 242 memberAddrs := mgr.GetMemberAddrs(ctx, cluster) 243 // AddCurrentMemberToCluster is executed only when memberAddrs are successfully obtained and memberAddrs not Contains CurrentMember 244 if memberAddrs != nil && !slices.Contains(memberAddrs, cluster.GetMemberAddrWithName(mgr.CurrentMemberName)) { 245 return false 246 } 247 return true 248 } 249 250 func (mgr *Manager) IsCurrentMemberHealthy(ctx context.Context, cluster *dcs.Cluster) bool { 251 return mgr.IsMemberHealthy(ctx, cluster, cluster.GetMemberWithName(mgr.CurrentMemberName)) 252 } 253 254 // IsMemberHealthy firstly get the leader's connection pool, 255 // because only leader can get the cluster healthy view 256 func (mgr *Manager) IsMemberHealthy(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) bool { 257 healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, member) 258 if errors.Is(err, postgres.ClusterHasNoLeader) { 259 mgr.Logger.Info("cluster has no leader, will compete the leader lock") 260 return true 261 } else if err != nil { 262 mgr.Logger.Error(err, "check member healthy failed") 263 return false 264 } 265 266 return healthStatus.Connected 267 } 268 269 func (mgr *Manager) getMemberHealthStatus(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (*postgres.ConsensusMemberHealthStatus, error) { 270 if mgr.DBState != nil && mgr.healthStatus != nil { 271 return mgr.healthStatus, nil 272 } 273 res := &postgres.ConsensusMemberHealthStatus{} 274 275 IPPort := mgr.Config.GetConsensusIPPort(cluster, member.Name) 276 sql := fmt.Sprintf(`select connected, log_delay_num from consensus_cluster_health where ip_port = '%s';`, IPPort) 277 resp, err := mgr.QueryLeader(ctx, sql, cluster) 278 if err != nil { 279 return nil, err 280 } 281 282 resMap, err := postgres.ParseQuery(string(resp)) 283 if err != nil { 284 return nil, err 285 } 286 287 if resMap[0]["connected"] != nil { 288 res.Connected = cast.ToBool(resMap[0]["connected"]) 289 } 290 if resMap[0]["log_delay_num"] != nil { 291 res.LogDelayNum = cast.ToInt64(resMap[0]["log_delay_num"]) 292 } 293 294 return res, nil 295 } 296 297 func (mgr *Manager) IsMemberLagging(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (bool, int64) { 298 healthStatus, err := mgr.getMemberHealthStatus(ctx, cluster, member) 299 if errors.Is(err, postgres.ClusterHasNoLeader) { 300 mgr.Logger.Info("cluster has no leader, so member has no lag") 301 return false, 0 302 } else if err != nil { 303 mgr.Logger.Error(err, "check member lag failed") 304 return true, cluster.HaConfig.GetMaxLagOnSwitchover() + 1 305 } 306 307 return healthStatus.LogDelayNum > cluster.HaConfig.GetMaxLagOnSwitchover(), healthStatus.LogDelayNum 308 } 309 310 func (mgr *Manager) JoinCurrentMemberToCluster(ctx context.Context, cluster *dcs.Cluster) error { 311 sql := fmt.Sprintf(`alter system consensus add follower '%s:%d';`, 312 cluster.GetMemberAddrWithName(mgr.CurrentMemberName), mgr.Config.GetDBPort()) 313 314 _, err := mgr.ExecLeader(ctx, sql, cluster) 315 if err != nil { 316 mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql)) 317 return err 318 } 319 320 return nil 321 } 322 323 func (mgr *Manager) LeaveMemberFromCluster(ctx context.Context, cluster *dcs.Cluster, host string) error { 324 sql := fmt.Sprintf(`alter system consensus drop follower '%s:%d';`, 325 host, mgr.Config.GetDBPort()) 326 327 // only leader can delete member, so don't need to get pool 328 _, err := mgr.ExecWithHost(ctx, sql, "") 329 if err != nil { 330 mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql)) 331 return err 332 } 333 334 return nil 335 } 336 337 // IsClusterHealthy considers the health status of the cluster equivalent to the health status of the leader 338 func (mgr *Manager) IsClusterHealthy(ctx context.Context, cluster *dcs.Cluster) bool { 339 leaderMember := cluster.GetLeaderMember() 340 if leaderMember == nil { 341 mgr.Logger.Info("cluster has no leader, wait for leader to take the lock") 342 // when cluster has no leader, the health status of the cluster is assumed to be true by default, 343 // in order to proceed with the logic of competing for the leader lock 344 return true 345 } 346 347 if leaderMember.Name == mgr.CurrentMemberName { 348 // if the member is leader, then its health status will check in IsMemberHealthy later 349 return true 350 } 351 352 return mgr.IsMemberHealthy(ctx, cluster, leaderMember) 353 } 354 355 func (mgr *Manager) Promote(ctx context.Context, cluster *dcs.Cluster) error { 356 if isLeader, err := mgr.IsLeader(ctx, nil); isLeader && err == nil { 357 mgr.Logger.Info("i am already the leader, don't need to promote") 358 return nil 359 } 360 361 // TODO:will get leader ip_port from consensus_member_status directly in the future 362 sql := `select ip_port from consensus_cluster_status where server_id = (select current_leader from consensus_member_status);` 363 resp, err := mgr.Query(ctx, sql) 364 if err != nil { 365 mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql)) 366 return err 367 } 368 369 resMap, err := postgres.ParseQuery(string(resp)) 370 if err != nil { 371 return errors.Errorf("parse query response:%s failed, err:%v", string(resp), err) 372 } 373 374 currentLeaderAddr := strings.Split(cast.ToString(resMap[0]["ip_port"]), ":")[0] 375 promoteSQL := fmt.Sprintf(`alter system consensus CHANGE LEADER TO '%s:%d';`, cluster.GetMemberAddrWithName(mgr.CurrentMemberName), mgr.Config.GetDBPort()) 376 _, err = mgr.ExecWithHost(ctx, promoteSQL, currentLeaderAddr) 377 if err != nil { 378 mgr.Logger.Error(err, fmt.Sprintf("exec sql:%s failed", sql)) 379 return err 380 } 381 382 return nil 383 } 384 385 func (mgr *Manager) IsPromoted(ctx context.Context) bool { 386 isLeader, _ := mgr.IsLeader(ctx, nil) 387 return isLeader 388 } 389 390 func (mgr *Manager) HasOtherHealthyLeader(ctx context.Context, cluster *dcs.Cluster) *dcs.Member { 391 if isLeader, err := mgr.IsLeader(ctx, cluster); isLeader && err == nil { 392 // I am the leader, just return nil 393 return nil 394 } 395 396 // TODO:will get leader ip_port from consensus_member_status directly in the future 397 sql := `select ip_port from consensus_cluster_status where server_id = (select current_leader from consensus_member_status);` 398 resp, err := mgr.Query(ctx, sql) 399 if err != nil { 400 mgr.Logger.Error(err, fmt.Sprintf("query sql:%s failed", sql)) 401 return nil 402 } 403 404 resMap, err := postgres.ParseQuery(string(resp)) 405 if err != nil { 406 mgr.Logger.Error(err, "parse query response failed") 407 return nil 408 } 409 410 host := strings.Split(cast.ToString(resMap[0]["ip_port"]), ":")[0] 411 leaderName := strings.Split(host, ".")[0] 412 if len(leaderName) > 0 { 413 return cluster.GetMemberWithName(leaderName) 414 } 415 416 return nil 417 } 418 419 func (mgr *Manager) HasOtherHealthyMembers(ctx context.Context, cluster *dcs.Cluster, leader string) []*dcs.Member { 420 members := make([]*dcs.Member, 0) 421 422 for i, m := range cluster.Members { 423 if m.Name != leader && mgr.IsMemberHealthy(ctx, cluster, &m) { 424 members = append(members, &cluster.Members[i]) 425 } 426 } 427 428 return members 429 }