github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/engines/mysql/manager.go

/*
Copyright (C) 2022-2023 ApeCloud Co., Ltd

This file is part of KubeBlocks project

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package mysql

import (
	"context"
	"database/sql"
	"fmt"
	"strings"
	"time"

	"github.com/go-sql-driver/mysql"
	"github.com/pkg/errors"
	ctrl "sigs.k8s.io/controller-runtime"

	"github.com/1aal/kubeblocks/pkg/lorry/dcs"
	"github.com/1aal/kubeblocks/pkg/lorry/engines"
)

const (
	MysqlServiceType = "mysql"
)

type Manager struct {
	engines.DBManagerBase
	DB                           *sql.DB
	hostname                     string
	serverID                     uint
	version                      string
	binlogFormat                 string
	logbinEnabled                bool
	logReplicationUpdatesEnabled bool
	opTimestamp                  int64
	globalState                  map[string]string
	masterStatus                 RowMap
	slaveStatus                  RowMap
}

var _ engines.DBManager = &Manager{}

func NewManager(properties engines.Properties) (engines.DBManager, error) {
	logger := ctrl.Log.WithName("MySQL")
	config, err := NewConfig(properties)
	if err != nil {
		return nil, err
	}

	db, err := config.GetLocalDBConn()
	if err != nil {
		return nil, errors.Wrap(err, "connect to MySQL")
	}

	// close the connection if any later step fails
	defer func() {
		if err != nil {
			derr := db.Close()
			if derr != nil {
				logger.Error(derr, "failed to close")
			}
		}
	}()

	managerBase, err := engines.NewDBManagerBase(logger)
	if err != nil {
		return nil, err
	}

	serverID, err := engines.GetIndex(managerBase.CurrentMemberName)
	if err != nil {
		return nil, err
	}

	mgr := &Manager{
		DBManagerBase: *managerBase,
		serverID:      uint(serverID) + 1,
		DB:            db,
	}

	return mgr, nil
}

func (mgr *Manager) InitializeCluster(ctx context.Context, cluster *dcs.Cluster) error {
	return nil
}

func (mgr *Manager) IsRunning() bool {
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()

	// test if db is ready to connect or not
	err := mgr.DB.PingContext(ctx)
	if err != nil {
		if driverErr, ok := err.(*mysql.MySQLError); ok {
			// the error number is accessible directly
			if driverErr.Number == 1040 {
				// error 1040 (too many connections): the server is alive but saturated
				mgr.Logger.Error(err, "Too many connections")
				return true
			}
		}
		mgr.Logger.Error(err, "DB is not ready")
		return false
	}

	return true
}

func (mgr *Manager) IsDBStartupReady() bool {
	if mgr.DBStartupReady {
		return true
	}
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()

	// test if db is ready to connect or not
	err := mgr.DB.PingContext(ctx)
	if err != nil {
		mgr.Logger.Error(err, "DB is not ready")
		return false
	}

	mgr.DBStartupReady = true
	mgr.Logger.Info("DB startup ready")
	return true
}
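// Example (illustrative sketch, not taken from this file): a probe loop might
// gate further role checks on the two readiness methods above. The polling
// interval is an assumption:
//
//	for !mgr.IsDBStartupReady() {
//		time.Sleep(time.Second)
//	}
//	if mgr.IsRunning() {
//		// safe to run role probes such as mgr.IsLeader(ctx, cluster)
//	}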
func (mgr *Manager) IsReadonly(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (bool, error) {
	var db *sql.DB
	var err error
	if member != nil {
		addr := cluster.GetMemberAddrWithPort(*member)
		db, err = config.GetDBConnWithAddr(addr)
		if err != nil {
			mgr.Logger.Error(err, "Get Member conn failed")
			return false, err
		}
		if db != nil {
			defer db.Close()
		}
	} else {
		db = mgr.DB
	}

	var readonly bool
	err = db.QueryRowContext(ctx, "select @@global.hostname, @@global.version, "+
		"@@global.read_only, @@global.binlog_format, @@global.log_bin, @@global.log_slave_updates").
		Scan(&mgr.hostname, &mgr.version, &readonly, &mgr.binlogFormat,
			&mgr.logbinEnabled, &mgr.logReplicationUpdatesEnabled)
	if err != nil {
		mgr.Logger.Error(err, "Get global readonly failed")
		return false, err
	}
	return readonly, nil
}

func (mgr *Manager) IsLeader(ctx context.Context, cluster *dcs.Cluster) (bool, error) {
	readonly, err := mgr.IsReadonly(ctx, nil, nil)

	if err != nil || readonly {
		return false, err
	}

	// if cluster.Leader != nil && cluster.Leader.Name != "" {
	// 	if cluster.Leader.Name == mgr.CurrentMemberName {
	// 		return true, nil
	// 	} else {
	// 		return false, nil
	// 	}
	// }

	// // During the initialization of cluster, there would be more than one leader,
	// // in this case, the first member is chosen as the leader
	// if mgr.CurrentMemberName == cluster.Members[0].Name {
	// 	return true, nil
	// }
	// isFirstMemberLeader, err := mgr.IsLeaderMember(ctx, cluster, &cluster.Members[0])
	// if err == nil && isFirstMemberLeader {
	// 	return false, nil
	// }

	return true, err
}

func (mgr *Manager) IsLeaderMember(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (bool, error) {
	readonly, err := mgr.IsReadonly(ctx, cluster, member)
	if err != nil || readonly {
		return false, err
	}

	return true, err
}

func (mgr *Manager) InitiateCluster(cluster *dcs.Cluster) error {
	return nil
}

func (mgr *Manager) GetMemberAddrs(ctx context.Context, cluster *dcs.Cluster) []string {
	return cluster.GetMemberAddrs()
}

func (mgr *Manager) GetLeaderClient(ctx context.Context, cluster *dcs.Cluster) (*sql.DB, error) {
	leaderMember := cluster.GetLeaderMember()
	if leaderMember == nil {
		return nil, fmt.Errorf("cluster has no leader")
	}

	addr := cluster.GetMemberAddrWithPort(*leaderMember)
	return config.GetDBConnWithAddr(addr)
}

func (mgr *Manager) IsCurrentMemberInCluster(ctx context.Context, cluster *dcs.Cluster) bool {
	return true
}

func (mgr *Manager) IsCurrentMemberHealthy(ctx context.Context, cluster *dcs.Cluster) bool {
	// _, _ = mgr.EnsureServerID(ctx)
	member := cluster.GetMemberWithName(mgr.CurrentMemberName)

	return mgr.IsMemberHealthy(ctx, cluster, member)
}
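// Role detection above is purely flag-based: a member that reports
// read_only=0 is treated as writable and hence as the leader. A minimal
// usage sketch (ctx and cluster are assumed to be in scope):
//
//	isLeader, err := mgr.IsLeader(ctx, cluster)
//	if err == nil && isLeader {
//		// this member may serve writes / renew the leader lease in the DCS
//	}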
func (mgr *Manager) IsMemberLagging(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) (bool, int64) {
	var db *sql.DB
	var err error
	var leaderDBState *dcs.DBState
	if cluster.Leader == nil || cluster.Leader.DBState == nil {
		mgr.Logger.Info("No leader DBState info")
		return false, 0
	}
	leaderDBState = cluster.Leader.DBState

	if member != nil && member.Name != mgr.CurrentMemberName {
		addr := cluster.GetMemberAddrWithPort(*member)
		db, err = config.GetDBConnWithAddr(addr)
		if err != nil {
			mgr.Logger.Error(err, "Get Member conn failed")
			return false, 0
		}
		if db != nil {
			defer db.Close()
		}
	} else {
		db = mgr.DB
	}
	opTimestamp, err := mgr.GetOpTimestamp(ctx, db)
	if err != nil {
		mgr.Logger.Error(err, "get op timestamp failed")
		return false, 0
	}

	lag := leaderDBState.OpTimestamp - opTimestamp
	if lag <= cluster.HaConfig.GetMaxLagOnSwitchover() {
		return false, 0
	}
	// member may be nil when checking the current member; fall back to its name
	memberName := mgr.CurrentMemberName
	if member != nil {
		memberName = member.Name
	}
	mgr.Logger.Info(fmt.Sprintf("The member %s has lag: %d", memberName, lag))
	return true, lag
}

func (mgr *Manager) IsMemberHealthy(ctx context.Context, cluster *dcs.Cluster, member *dcs.Member) bool {
	var db *sql.DB
	var err error
	if member != nil && member.Name != mgr.CurrentMemberName {
		addr := cluster.GetMemberAddrWithPort(*member)
		db, err = config.GetDBConnWithAddr(addr)
		if err != nil {
			mgr.Logger.Error(err, "Get Member conn failed")
			return false
		}
		if db != nil {
			defer db.Close()
		}
	} else {
		db = mgr.DB
	}

	// only the leader needs to pass the write check; guard against a nil member
	if member != nil && cluster.Leader != nil && cluster.Leader.Name == member.Name {
		if !mgr.WriteCheck(ctx, db) {
			return false
		}
	}
	if !mgr.ReadCheck(ctx, db) {
		return false
	}

	return true
}

func (mgr *Manager) GetDBState(ctx context.Context, cluster *dcs.Cluster) *dcs.DBState {
	mgr.DBState = nil

	globalState, err := mgr.GetGlobalState(ctx, mgr.DB)
	if err != nil {
		mgr.Logger.Error(err, "get global state failed")
		return nil
	}

	masterStatus, err := mgr.GetMasterStatus(ctx, mgr.DB)
	if err != nil {
		mgr.Logger.Error(err, "show master status failed")
		return nil
	}

	slaveStatus, err := mgr.GetSlaveStatus(ctx, mgr.DB)
	if err != nil {
		mgr.Logger.Error(err, "show slave status failed")
		return nil
	}

	opTimestamp, err := mgr.GetOpTimestamp(ctx, mgr.DB)
	if err != nil {
		mgr.Logger.Error(err, "get op timestamp failed")
		return nil
	}

	dbState := &dcs.DBState{
		OpTimestamp: opTimestamp,
		Extra:       map[string]string{},
	}
	for k, v := range globalState {
		dbState.Extra[k] = v
	}

	if cluster.Leader != nil && cluster.Leader.Name == mgr.CurrentMemberName {
		dbState.Extra["Binlog_File"] = masterStatus.GetString("File")
		// SHOW MASTER STATUS reports the binlog offset in the "Position" column
		dbState.Extra["Binlog_Pos"] = masterStatus.GetString("Position")
	} else {
		dbState.Extra["Master_Host"] = slaveStatus.GetString("Master_Host")
		dbState.Extra["Master_UUID"] = slaveStatus.GetString("Master_UUID")
		dbState.Extra["Slave_IO_Running"] = slaveStatus.GetString("Slave_IO_Running")
		dbState.Extra["Slave_SQL_Running"] = slaveStatus.GetString("Slave_SQL_Running")
		dbState.Extra["Last_IO_Error"] = slaveStatus.GetString("Last_IO_Error")
		dbState.Extra["Last_SQL_Error"] = slaveStatus.GetString("Last_SQL_Error")
		dbState.Extra["Master_Log_File"] = slaveStatus.GetString("Master_Log_File")
		dbState.Extra["Read_Master_Log_Pos"] = slaveStatus.GetString("Read_Master_Log_Pos")
		dbState.Extra["Relay_Master_Log_File"] = slaveStatus.GetString("Relay_Master_Log_File")
		dbState.Extra["Exec_Master_Log_Pos"] = slaveStatus.GetString("Exec_Master_Log_Pos")
	}

	mgr.globalState = globalState
	mgr.masterStatus = masterStatus
	mgr.slaveStatus = slaveStatus
	mgr.opTimestamp = opTimestamp
	mgr.DBState = dbState

	return dbState
}
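// The write/read checks below share a single-row bookkeeping table. Its
// schema, reconstructed from the statements in WriteCheck, is:
//
//	CREATE TABLE IF NOT EXISTS kubeblocks.kb_health_check (
//		type     INT,
//		check_ts BIGINT,
//		PRIMARY KEY (type)
//	);
//
// Each WriteCheck upserts UNIX_TIMESTAMP() into check_ts, GetOpTimestamp
// reads it back, and the leader/replica difference of check_ts is what
// IsMemberLagging measures.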
func (mgr *Manager) WriteCheck(ctx context.Context, db *sql.DB) bool {
	writeSQL := fmt.Sprintf(`BEGIN;
CREATE DATABASE IF NOT EXISTS kubeblocks;
CREATE TABLE IF NOT EXISTS kubeblocks.kb_health_check(type INT, check_ts BIGINT, PRIMARY KEY(type));
INSERT INTO kubeblocks.kb_health_check VALUES(%d, UNIX_TIMESTAMP()) ON DUPLICATE KEY UPDATE check_ts = UNIX_TIMESTAMP();
COMMIT;`, engines.CheckStatusType)
	_, err := db.ExecContext(ctx, writeSQL)
	if err != nil {
		mgr.Logger.Error(err, fmt.Sprintf("SQL %s executing failed", writeSQL))
		return false
	}
	return true
}

func (mgr *Manager) ReadCheck(ctx context.Context, db *sql.DB) bool {
	_, err := mgr.GetOpTimestamp(ctx, db)
	if err != nil {
		if err == sql.ErrNoRows {
			// no health check records yet, return true
			return true
		}
		mysqlErr, ok := err.(*mysql.MySQLError)
		if ok && (mysqlErr.Number == 1049 || mysqlErr.Number == 1146) {
			// error 1049: database does not exist
			// error 1146: table does not exist
			// the health check schema has not been created yet, return true
			return true
		}
		mgr.Logger.Error(err, "Read check failed")
		return false
	}

	return true
}

func (mgr *Manager) GetOpTimestamp(ctx context.Context, db *sql.DB) (int64, error) {
	readSQL := fmt.Sprintf(`select check_ts from kubeblocks.kb_health_check where type=%d limit 1;`, engines.CheckStatusType)
	var opTimestamp int64
	err := db.QueryRowContext(ctx, readSQL).Scan(&opTimestamp)
	return opTimestamp, err
}
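// A minimal sketch of how the op timestamp becomes a lag figure (the names
// leaderTS/replicaTS are illustrative; see IsMemberLagging for the real flow):
//
//	leaderTS := cluster.Leader.DBState.OpTimestamp
//	replicaTS, err := mgr.GetOpTimestamp(ctx, mgr.DB)
//	if err == nil && leaderTS-replicaTS > cluster.HaConfig.GetMaxLagOnSwitchover() {
//		// this member is lagging too far behind to take over as leader
//	}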
func (mgr *Manager) GetGlobalState(ctx context.Context, db *sql.DB) (map[string]string, error) {
	var hostname, serverUUID, gtidExecuted, gtidPurged, isReadonly, superReadonly string
	err := db.QueryRowContext(ctx, "select @@global.hostname, @@global.server_uuid, "+
		"@@global.gtid_executed, @@global.gtid_purged, @@global.read_only, @@global.super_read_only").
		Scan(&hostname, &serverUUID, &gtidExecuted, &gtidPurged, &isReadonly, &superReadonly)
	if err != nil {
		return nil, err
	}

	return map[string]string{
		"hostname":        hostname,
		"server_uuid":     serverUUID,
		"gtid_executed":   gtidExecuted,
		"gtid_purged":     gtidPurged,
		"read_only":       isReadonly,
		"super_read_only": superReadonly,
	}, nil
}

func (mgr *Manager) GetSlaveStatus(ctx context.Context, db *sql.DB) (RowMap, error) {
	query := "show slave status"
	var rowMap RowMap

	// use the passed-in connection rather than mgr.DB, so remote members can
	// be queried as well
	err := QueryRowsMap(db, query, func(rMap RowMap) error {
		rowMap = rMap
		return nil
	})
	if err != nil {
		mgr.Logger.Error(err, fmt.Sprintf("error executing %s", query))
		return nil, err
	}
	return rowMap, nil
}

func (mgr *Manager) GetMasterStatus(ctx context.Context, db *sql.DB) (RowMap, error) {
	query := "show master status"
	var rowMap RowMap

	err := QueryRowsMap(db, query, func(rMap RowMap) error {
		rowMap = rMap
		return nil
	})
	if err != nil {
		mgr.Logger.Error(err, fmt.Sprintf("error executing %s", query))
		return nil, err
	}
	return rowMap, nil
}

func (mgr *Manager) Recover(context.Context) error {
	return nil
}

func (mgr *Manager) JoinCurrentMemberToCluster(ctx context.Context, cluster *dcs.Cluster) error {
	return nil
}

func (mgr *Manager) LeaveMemberFromCluster(context.Context, *dcs.Cluster, string) error {
	return nil
}

// func (mgr *Manager) IsClusterHealthy(ctx context.Context, cluster *dcs.Cluster) bool {
// 	leaderMember := cluster.GetLeaderMember()
// 	if leaderMember == nil {
// 		mgr.Logger.Infof("IsClusterHealthy: has no leader.")
// 		return true
// 	}

// 	return mgr.IsMemberHealthy(ctx, cluster, leaderMember)
// }

// IsClusterInitialized is a method to check if cluster is initialized or not
func (mgr *Manager) IsClusterInitialized(ctx context.Context, cluster *dcs.Cluster) (bool, error) {
	return mgr.EnsureServerID(ctx)
}

func (mgr *Manager) EnsureServerID(ctx context.Context) (bool, error) {
	var serverID uint
	err := mgr.DB.QueryRowContext(ctx, "select @@global.server_id").Scan(&serverID)
	if err != nil {
		mgr.Logger.Info("Get global server id failed", "error", err)
		return false, err
	}
	if serverID == mgr.serverID {
		return true, nil
	}
	mgr.Logger.Info("Set global server id", "current", serverID, "expected", mgr.serverID)

	setServerID := fmt.Sprintf(`set global server_id = %d`, mgr.serverID)
	_, err = mgr.DB.Exec(setServerID)
	if err != nil {
		mgr.Logger.Info("set server id failed", "error", err)
		return false, err
	}

	return true, nil
}

func (mgr *Manager) Promote(ctx context.Context, cluster *dcs.Cluster) error {
	// already writable and not replicating: nothing to do
	if (mgr.globalState["super_read_only"] == "0" && mgr.globalState["read_only"] == "0") &&
		(len(mgr.slaveStatus) == 0 || (mgr.slaveStatus.GetString("Slave_IO_Running") == "No" &&
			mgr.slaveStatus.GetString("Slave_SQL_Running") == "No")) {
		return nil
	}
	stopReadOnly := `set global read_only=off;set global super_read_only=off;`
	stopSlave := `stop slave;`
	resp, err := mgr.DB.Exec(stopReadOnly + stopSlave)
	if err != nil {
		mgr.Logger.Error(err, "promote err")
		return err
	}

	mgr.Logger.Info(fmt.Sprintf("promote success, resp:%v", resp))
	return nil
}
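// Promote and Demote are symmetric: promotion clears both readonly flags and
// stops replication, demotion re-enables both flags. A failover sketch (the
// ordering and the oldLeader/newLeader names are assumptions for
// illustration, not taken from this file):
//
//	_ = oldLeader.Demote(ctx)           // fence the old leader first
//	_ = newLeader.Promote(ctx, cluster) // then open the new one for writes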
func (mgr *Manager) Demote(context.Context) error {
	setReadOnly := `set global read_only=on;set global super_read_only=on;`

	_, err := mgr.DB.Exec(setReadOnly)
	if err != nil {
		mgr.Logger.Error(err, "demote err")
		return err
	}
	return nil
}

func (mgr *Manager) Follow(ctx context.Context, cluster *dcs.Cluster) error {
	leaderMember := cluster.GetLeaderMember()
	if leaderMember == nil {
		return fmt.Errorf("cluster has no leader")
	}

	if mgr.CurrentMemberName == cluster.Leader.Name {
		mgr.Logger.Info("current member holds the leader key, no need to follow")
		return nil
	}

	if !mgr.isRecoveryConfOutdate(ctx, cluster.Leader.Name) {
		return nil
	}

	stopSlave := `stop slave;`
	changeMaster := fmt.Sprintf(`change master to master_host='%s',master_user='%s',master_password='%s',master_port=%s,master_auto_position=1;`,
		cluster.GetMemberAddr(*leaderMember), config.username, config.password, leaderMember.DBPort)
	startSlave := `start slave;`

	_, err := mgr.DB.Exec(stopSlave + changeMaster + startSlave)
	if err != nil {
		mgr.Logger.Error(err, "change master failed")
		return err
	}

	mgr.Logger.Info("successfully follow new leader", "leader-name", leaderMember.Name)
	return nil
}

func (mgr *Manager) isRecoveryConfOutdate(ctx context.Context, leader string) bool {
	var rowMap = mgr.slaveStatus

	if len(rowMap) == 0 {
		return true
	}

	ioError := rowMap.GetString("Last_IO_Error")
	sqlError := rowMap.GetString("Last_SQL_Error")
	if ioError != "" || sqlError != "" {
		mgr.Logger.Error(nil, fmt.Sprintf("slave status error, sqlError: %s, ioError: %s", sqlError, ioError))
		return true
	}

	masterHost := rowMap.GetString("Master_Host")
	return !strings.HasPrefix(masterHost, leader)
}

func (mgr *Manager) GetHealthiestMember(cluster *dcs.Cluster, candidate string) *dcs.Member {
	return nil
}

// func (mgr *Manager) HasOtherHealthyLeader(ctx context.Context, cluster *dcs.Cluster) *dcs.Member {
// 	return nil
// 	isLeader, err := mgr.IsLeader(ctx, cluster)
// 	if err == nil && isLeader {
// 		// if current member is leader, just return
// 		return nil
// 	}

// 	for _, member := range cluster.Members {
// 		if member.Name == mgr.CurrentMemberName {
// 			continue
// 		}

// 		isLeader, err := mgr.IsLeaderMember(ctx, cluster, &member)
// 		if err == nil && isLeader {
// 			return &member
// 		}
// 	}

// 	return nil
// }

// HasOtherHealthyMembers checks if there are any healthy members, excluding the leader
func (mgr *Manager) HasOtherHealthyMembers(ctx context.Context, cluster *dcs.Cluster, leader string) []*dcs.Member {
	members := make([]*dcs.Member, 0)
	// index into the slice instead of taking the address of the loop variable,
	// which would make every appended pointer alias the same element
	for i := range cluster.Members {
		member := &cluster.Members[i]
		if member.Name == leader {
			continue
		}
		if !mgr.IsMemberHealthy(ctx, cluster, member) {
			continue
		}
		members = append(members, member)
	}

	return members
}

func (mgr *Manager) IsRootCreated(ctx context.Context) (bool, error) {
	return true, nil
}

func (mgr *Manager) CreateRoot(ctx context.Context) error {
	return nil
}

func (mgr *Manager) Lock(ctx context.Context, reason string) error {
	setReadOnly := `set global read_only=on;`

	_, err := mgr.DB.Exec(setReadOnly)
	if err != nil {
		mgr.Logger.Error(err, "Lock err")
		return err
	}
	mgr.IsLocked = true
	return nil
}
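// Lock only sets read_only (not super_read_only), so sessions with SUPER
// privileges can still write. A hedged usage sketch, assuming a caller that
// wants to freeze ordinary writes for the duration of some maintenance task:
//
//	if err := mgr.Lock(ctx, "backup"); err == nil {
//		defer func() { _ = mgr.Unlock(ctx) }()
//		// run the task while ordinary writes are rejected
//	}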
func (mgr *Manager) Unlock(ctx context.Context) error {
	setReadOnlyOff := `set global read_only=off;`

	_, err := mgr.DB.Exec(setReadOnlyOff)
	if err != nil {
		mgr.Logger.Error(err, "Unlock err")
		return err
	}
	mgr.IsLocked = false
	return nil
}
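// Example (sketch, assuming an engines.Properties map prepared by the caller;
// error handling elided for brevity):
//
//	mgr, err := NewManager(properties)
//	if err != nil {
//		// handle the config/connection error
//	}
//	if mgr.IsDBStartupReady() {
//		// begin normal health checking and role probing
//	}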