vitess.io/vitess@v0.16.2/go/vt/vtgr/db/mysql.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package db 18 19 import ( 20 "errors" 21 "fmt" 22 "math" 23 "strconv" 24 "strings" 25 26 gouuid "github.com/google/uuid" 27 "github.com/spf13/pflag" 28 29 "vitess.io/vitess/go/vt/external/golib/sqlutils" 30 31 "vitess.io/vitess/go/mysql" 32 "vitess.io/vitess/go/vt/log" 33 "vitess.io/vitess/go/vt/servenv" 34 "vitess.io/vitess/go/vt/vtgr/config" 35 "vitess.io/vitess/go/vt/vtgr/inst" 36 ) 37 38 var ( 39 configFilePath string 40 dbFlavor = "MySQL56" 41 mysqlGroupPort = 33061 42 enableHeartbeatCheck bool 43 44 // ErrGroupSplitBrain is the error when mysql group is split-brain 45 ErrGroupSplitBrain = errors.New("group has split brain") 46 // ErrGroupBackoffError is either the transient error or network partition from the group 47 ErrGroupBackoffError = errors.New("group backoff error") 48 // ErrGroupOngoingBootstrap is the error when a bootstrap is in progress 49 ErrGroupOngoingBootstrap = errors.New("group ongoing bootstrap") 50 // ErrGroupInactive is the error when mysql group is inactive unexpectedly 51 ErrGroupInactive = errors.New("group is inactive") 52 // ErrInvalidInstance is the error when the instance key has empty hostname 53 ErrInvalidInstance = errors.New("invalid mysql instance key") 54 ) 55 56 func init() { 57 servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { 58 fs.StringVar(&configFilePath, "db_config", "", "Full path to db config file that will be used by VTGR.") 59 fs.StringVar(&dbFlavor, "db_flavor", "MySQL56", "MySQL flavor override.") 60 fs.IntVar(&mysqlGroupPort, "gr_port", 33061, "Port to bootstrap a MySQL group.") 61 fs.BoolVar(&enableHeartbeatCheck, "enable_heartbeat_check", false, "Enable heartbeat checking, set together with --group_heartbeat_threshold.") 62 }) 63 } 64 65 // Agent is used by vtgr to interact with Mysql 66 type Agent interface { 67 // BootstrapGroupLocked bootstraps a mysql group 68 // the caller should grab a lock before 69 BootstrapGroupLocked(instanceKey *inst.InstanceKey) error 70 71 // RebootstrapGroupLocked rebootstrap a group with an existing name 72 RebootstrapGroupLocked(instanceKey *inst.InstanceKey, name string) error 73 74 // StopGroupLocked stops a mysql group 75 StopGroupLocked(instanceKey *inst.InstanceKey) error 76 77 // JoinGroupLocked puts an instance into a mysql group based on primary instance 78 // the caller should grab a lock before 79 JoinGroupLocked(instanceKey *inst.InstanceKey, primaryKey *inst.InstanceKey) error 80 81 // SetReadOnly set super_read_only variable 82 // https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_super_read_only 83 SetReadOnly(instanceKey *inst.InstanceKey, readOnly bool) error 84 85 // FetchApplierGTIDSet fetches the GTID set from group_replication_applier channel 86 FetchApplierGTIDSet(instanceKey *inst.InstanceKey) (mysql.GTIDSet, error) 87 88 // Failover move the mysql primary to the node defined by memberUUID 89 Failover(instance *inst.InstanceKey) error 90 91 // FetchGroupView fetches group related information 92 FetchGroupView(alias string, instanceKey *inst.InstanceKey) (*GroupView, error) 93 } 94 95 // MemberState is member state 96 type MemberState int 97 98 // MemberRole is member role 99 type MemberRole int 100 101 const ( 102 UNKNOWNSTATE MemberState = iota 103 OFFLINE 104 UNREACHABLE 105 RECOVERING 106 ONLINE 107 ERROR 108 ) 109 110 const ( 111 UNKNOWNROLE MemberRole = iota 112 SECONDARY 113 PRIMARY 114 ) 115 116 // GroupMember represents a ROW we get from performance_schema 117 type GroupMember struct { 118 HostName string 119 Port int 120 Role MemberRole 121 State MemberState 122 ReadOnly bool 123 } 124 125 // GroupView is an instance's view for the group 126 type GroupView struct { 127 TabletAlias string 128 MySQLHost string 129 MySQLPort int 130 GroupName string 131 HeartbeatStaleness int 132 UnresolvedMembers []*GroupMember 133 } 134 135 // SQLAgentImpl implements Agent 136 type SQLAgentImpl struct { 137 config *config.Configuration 138 dbFlavor string 139 enableHeartbeat bool 140 } 141 142 // NewGroupView creates a new GroupView 143 func NewGroupView(alias, host string, port int) *GroupView { 144 return &GroupView{TabletAlias: alias, MySQLHost: host, MySQLPort: port} 145 } 146 147 // NewGroupMember creates a new GroupMember 148 func NewGroupMember(state, role, host string, port int, readonly bool) *GroupMember { 149 return &GroupMember{ 150 State: toMemberState(state), 151 Role: toMemberRole(role), 152 HostName: host, 153 Port: port, 154 ReadOnly: readonly, 155 } 156 } 157 158 // NewVTGRSqlAgent creates a SQLAgentImpl 159 func NewVTGRSqlAgent() *SQLAgentImpl { 160 var conf *config.Configuration 161 if (configFilePath) != "" { 162 log.Infof("use config from %v", configFilePath) 163 conf = config.ForceRead(configFilePath) 164 } else { 165 log.Warningf("use default config") 166 conf = config.Config 167 } 168 agent := &SQLAgentImpl{ 169 config: conf, 170 dbFlavor: dbFlavor, 171 enableHeartbeat: enableHeartbeatCheck, 172 } 173 return agent 174 } 175 176 // BootstrapGroupLocked implements Agent interface 177 func (agent *SQLAgentImpl) BootstrapGroupLocked(instanceKey *inst.InstanceKey) error { 178 if instanceKey == nil { 179 return errors.New("nil instance key for bootstrap") 180 } 181 // Before bootstrap a group, double check locally there is really nothing running locally 182 uuid, state, err := agent.getGroupNameAndMemberState(instanceKey) 183 if err != nil { 184 return err 185 } 186 if state != "" && state != inst.GroupReplicationMemberStateOffline { 187 return fmt.Errorf("%v not OFFLINE mode %v [group_name=%v]", instanceKey.Hostname, state, uuid) 188 } 189 // If there is a group name stored locally, we should try to reuse it 190 // for port, we will override with a new one 191 if uuid == "" { 192 uuid = gouuid.New().String() 193 log.Infof("Try to bootstrap with a new uuid") 194 } 195 log.Infof("Bootstrap group on %v with %v", instanceKey.Hostname, uuid) 196 return agent.bootstrapInternal(instanceKey, uuid) 197 } 198 199 func (agent *SQLAgentImpl) RebootstrapGroupLocked(instanceKey *inst.InstanceKey, name string) error { 200 log.Infof("Rebootstrapping group on %v with %v", instanceKey.Hostname, name) 201 return agent.bootstrapInternal(instanceKey, name) 202 } 203 204 func (agent *SQLAgentImpl) bootstrapInternal(instanceKey *inst.InstanceKey, uuid string) error { 205 // Use persist to set group_replication_group_name 206 // so that the instance will persist the name after restart 207 cmds := []string{ 208 "set global offline_mode=0", 209 fmt.Sprintf("set @@persist.group_replication_group_name=\"%s\"", uuid), 210 fmt.Sprintf("set global group_replication_local_address=\"%s:%d\"", instanceKey.Hostname, mysqlGroupPort), 211 fmt.Sprintf("set global group_replication_group_seeds=\"%s:%d\"", instanceKey.Hostname, mysqlGroupPort), 212 "set global group_replication_bootstrap_group=ON", 213 fmt.Sprintf("start group_replication user='%s', password='%s'", agent.config.MySQLReplicaUser, agent.config.MySQLReplicaPassword), 214 "set global group_replication_bootstrap_group=OFF", 215 } 216 for _, cmd := range cmds { 217 if err := execInstanceWithTopo(instanceKey, cmd); err != nil { 218 log.Errorf("Failed to execute: %v: %v", cmd, err) 219 return err 220 } 221 } 222 return nil 223 } 224 225 // StopGroupLocked implements Agent interface 226 func (agent *SQLAgentImpl) StopGroupLocked(instanceKey *inst.InstanceKey) error { 227 cmd := "stop group_replication" 228 return execInstanceWithTopo(instanceKey, cmd) 229 } 230 231 // SetReadOnly implements Agent interface 232 func (agent *SQLAgentImpl) SetReadOnly(instanceKey *inst.InstanceKey, readOnly bool) error { 233 // Setting super_read_only ON implicitly forces read_only ON 234 // Setting read_only OFF implicitly forces super_read_only OFF 235 // https://www.perconaicom/blog/2016/09/27/using-the-super_read_only-system-variable/ 236 if readOnly { 237 return execInstance(instanceKey, "set @@global.super_read_only=1") 238 } 239 return execInstance(instanceKey, "set @@global.read_only=0") 240 } 241 242 // JoinGroupLocked implements Agent interface 243 // Note: caller should grab the lock before calling this 244 func (agent *SQLAgentImpl) JoinGroupLocked(instanceKey *inst.InstanceKey, primaryInstanceKey *inst.InstanceKey) error { 245 var numExistingMembers int 246 var uuid string 247 query := `select count(*) as count, @@group_replication_group_name as group_name 248 from performance_schema.replication_group_members where member_state='ONLINE'` 249 err := fetchInstance(primaryInstanceKey, query, func(m sqlutils.RowMap) error { 250 numExistingMembers = m.GetInt("count") 251 uuid = m.GetString("group_name") 252 return nil 253 }) 254 if err != nil { 255 return err 256 } 257 if numExistingMembers == 0 { 258 return fmt.Errorf("there is no group members found on %v:%v", primaryInstanceKey.Hostname, primaryInstanceKey.Port) 259 } 260 // The queries above are executed on the primary instance 261 // now let's do one more check with local information to make sure it's OK to join the primary 262 localGroup, state, err := agent.getGroupNameAndMemberState(instanceKey) 263 if err != nil { 264 return err 265 } 266 if localGroup != "" && localGroup != uuid { 267 return fmt.Errorf("%v has a different group name (%v) than primary %v (%v)", instanceKey.Hostname, localGroup, primaryInstanceKey.Hostname, uuid) 268 } 269 if state == inst.GroupReplicationMemberStateOnline || state == inst.GroupReplicationMemberStateRecovering { 270 return fmt.Errorf("%v [%v] is alredy in a group %v", instanceKey.Hostname, state, localGroup) 271 } 272 var primaryGrPort int 273 query = `select @@group_replication_local_address as address` 274 err = fetchInstance(primaryInstanceKey, query, func(m sqlutils.RowMap) error { 275 address := m.GetString("address") 276 arr := strings.Split(address, ":") 277 primaryGrPort, err = strconv.Atoi(arr[1]) 278 if err != nil { 279 log.Errorf("Failed to parse primary GR port: %v", err) 280 return err 281 } 282 return nil 283 }) 284 if primaryGrPort == 0 { 285 return fmt.Errorf("cannot find group replication port on %v", primaryInstanceKey.Hostname) 286 } 287 // Now it's safe to join the group 288 cmds := []string{ 289 "set global offline_mode=0", 290 fmt.Sprintf("set @@persist.group_replication_group_name=\"%s\"", uuid), 291 fmt.Sprintf("set global group_replication_group_seeds=\"%s:%d\"", primaryInstanceKey.Hostname, primaryGrPort), 292 fmt.Sprintf("set global group_replication_local_address=\"%s:%d\"", instanceKey.Hostname, mysqlGroupPort), 293 fmt.Sprintf("start group_replication user='%s', password='%s'", agent.config.MySQLReplicaUser, agent.config.MySQLReplicaPassword), 294 } 295 for _, cmd := range cmds { 296 if err := execInstanceWithTopo(instanceKey, cmd); err != nil { 297 return err 298 } 299 } 300 return nil 301 } 302 303 // Failover implements Agent interface 304 func (agent *SQLAgentImpl) Failover(instance *inst.InstanceKey) error { 305 var memberUUID string 306 query := `select member_id 307 from performance_schema.replication_group_members 308 where member_host=convert(@@hostname using ascii) and member_port=@@port and member_state='ONLINE'` 309 err := fetchInstance(instance, query, func(m sqlutils.RowMap) error { 310 memberUUID = m.GetString("member_id") 311 if memberUUID == "" { 312 return fmt.Errorf("unable to find member_id on %v", instance.Hostname) 313 } 314 return nil 315 }) 316 if err != nil { 317 return err 318 } 319 cmd := fmt.Sprintf(`select group_replication_set_as_primary('%s')`, memberUUID) 320 if err := execInstance(instance, cmd); err != nil { 321 return err 322 } 323 return nil 324 } 325 326 // heartbeatCheck returns heartbeat check freshness result 327 func (agent *SQLAgentImpl) heartbeatCheck(instanceKey *inst.InstanceKey) (int, error) { 328 query := `select timestampdiff(SECOND, from_unixtime(truncate(ts * 0.000000001, 0)), NOW()) as diff from _vt.heartbeat;` 329 var result int 330 err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { 331 result = m.GetInt("diff") 332 return nil 333 }) 334 return result, err 335 } 336 337 // FetchGroupView implements Agent interface 338 func (agent *SQLAgentImpl) FetchGroupView(alias string, instanceKey *inst.InstanceKey) (*GroupView, error) { 339 view := NewGroupView(alias, instanceKey.Hostname, instanceKey.Port) 340 var groupName string 341 var isReadOnly bool 342 query := `select 343 @@group_replication_group_name as group_name, 344 @@super_read_only as read_only, 345 member_host, member_port, member_state, member_role 346 from performance_schema.replication_group_members` 347 err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { 348 if groupName == "" { 349 groupName = m.GetString("group_name") 350 } 351 host := m.GetString("member_host") 352 port := m.GetInt("member_port") 353 isReadOnly = m.GetBool("read_only") 354 unresolvedMember := NewGroupMember( 355 m.GetString("member_state"), 356 m.GetString("member_role"), 357 host, 358 port, 359 false) 360 // readOnly is used to re-enable write after we set primary to read_only to protect the shard when there is 361 // less than desired number of nodes 362 // the default value is false because if the node is reachable and read_only, it will get override by the OR op 363 // if the host is unreachable, we don't need to trigger the protection for it therefore assume the it's writable 364 if host == instanceKey.Hostname && port == instanceKey.Port && isReadOnly { 365 unresolvedMember.ReadOnly = true 366 } 367 view.UnresolvedMembers = append(view.UnresolvedMembers, unresolvedMember) 368 return nil 369 }) 370 view.GroupName = groupName 371 if err != nil { 372 return nil, err 373 } 374 view.HeartbeatStaleness = math.MaxInt32 375 if agent.enableHeartbeat { 376 heartbeatStaleness, err := agent.heartbeatCheck(instanceKey) 377 if err != nil { 378 // We can run into Error 1146: Table '_vt.heartbeat' doesn't exist on new provisioned shard: 379 // vtgr is checking heartbeat table 380 // -> heartbeat table is waiting primary tablet 381 // -> primary tablet needs vtgr. 382 // 383 // Therefore if we run into error, HeartbeatStaleness will 384 // remain to be max int32, which is 2147483647 sec 385 log.Errorf("Failed to check heartbeatCheck: %v", err) 386 } else { 387 view.HeartbeatStaleness = heartbeatStaleness 388 } 389 } 390 return view, nil 391 } 392 393 // GetPrimaryView returns the view of primary member 394 func (view *GroupView) GetPrimaryView() (string, int, bool) { 395 for _, member := range view.UnresolvedMembers { 396 if member.Role == PRIMARY { 397 return member.HostName, member.Port, member.State == ONLINE 398 } 399 } 400 return "", 0, false 401 } 402 403 func (agent *SQLAgentImpl) getGroupNameAndMemberState(instanceKey *inst.InstanceKey) (string, string, error) { 404 // If there is an instance that is unreachable but we still have quorum, GR will remove it from 405 // the replication_group_members and Failover if it is the primary node 406 // If the state becomes UNREACHABLE it indicates there is a network partition inside the group 407 // https://dev.mysql.com/doc/refman/8.0/en/group-replication-network-partitioning.html 408 // And then eventually if the node does not recover, the group will transit into ERROR state 409 // VTGR cannot handle this case, therefore we raise error here 410 var name, state string 411 query := `select @@group_replication_group_name as group_name` 412 err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { 413 name = m.GetString("group_name") 414 return nil 415 }) 416 if err != nil { 417 return "", "", err 418 } 419 query = `select member_state 420 from performance_schema.replication_group_members 421 where member_host=convert(@@hostname using ascii) and member_port=@@port` 422 err = fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { 423 state = m.GetString("member_state") 424 if state == "" { 425 state = inst.GroupReplicationMemberStateOffline 426 } 427 return nil 428 }) 429 if err != nil { 430 return "", "", err 431 } 432 return name, state, nil 433 } 434 435 // FetchApplierGTIDSet implements Agent interface 436 func (agent *SQLAgentImpl) FetchApplierGTIDSet(instanceKey *inst.InstanceKey) (mysql.GTIDSet, error) { 437 var gtidSet string 438 // TODO: should we also take group_replication_recovery as well? 439 query := `select gtid_subtract(concat(received_transaction_set, ',', @@global.gtid_executed), '') as gtid_set 440 from performance_schema.replication_connection_status 441 where channel_name='group_replication_applier'` 442 err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { 443 // If the instance has no committed transaction, gtidSet will be empty string 444 gtidSet = m.GetString("gtid_set") 445 return nil 446 }) 447 if err != nil { 448 return nil, err 449 } 450 pos, err := mysql.ParsePosition(agent.dbFlavor, gtidSet) 451 if err != nil { 452 return nil, err 453 } 454 return pos.GTIDSet, nil 455 } 456 457 // execInstance executes a given query on the given MySQL discovery instance 458 func execInstance(instanceKey *inst.InstanceKey, query string, args ...any) error { 459 if err := verifyInstance(instanceKey); err != nil { 460 return err 461 } 462 sqlDb, err := OpenDiscovery(instanceKey.Hostname, instanceKey.Port) 463 if err != nil { 464 log.Errorf("error exec %v: %v", query, err) 465 return err 466 } 467 _, err = sqlutils.ExecNoPrepare(sqlDb, query, args...) 468 return err 469 } 470 471 // execInstanceWithTopo executes a given query on the given MySQL topology instance 472 func execInstanceWithTopo(instanceKey *inst.InstanceKey, query string, args ...any) error { 473 if err := verifyInstance(instanceKey); err != nil { 474 return err 475 } 476 sqlDb, err := OpenTopology(instanceKey.Hostname, instanceKey.Port) 477 if err != nil { 478 log.Errorf("error exec %v: %v", query, err) 479 return err 480 } 481 _, err = sqlutils.ExecNoPrepare(sqlDb, query, args...) 482 return err 483 } 484 485 // fetchInstance fetches result from mysql 486 func fetchInstance(instanceKey *inst.InstanceKey, query string, onRow func(sqlutils.RowMap) error) error { 487 if err := verifyInstance(instanceKey); err != nil { 488 return err 489 } 490 sqlDb, err := OpenDiscovery(instanceKey.Hostname, instanceKey.Port) 491 if err != nil { 492 return err 493 } 494 return sqlutils.QueryRowsMap(sqlDb, query, onRow) 495 } 496 497 // The hostname and port can be empty if a tablet crashed and did not populate them in 498 // the topo server. We treat them as if the host is unreachable when we calculate the 499 // quorum for the shard. 500 func verifyInstance(instanceKey *inst.InstanceKey) error { 501 if instanceKey.Hostname == "" || instanceKey.Port == 0 { 502 return ErrInvalidInstance 503 } 504 return nil 505 } 506 507 // CreateInstanceKey returns an InstanceKey based on group member input 508 // When the group is init for the first time, the hostname and port are not set, e.g., 509 // +---------------------------+-----------+-------------+-------------+--------------+-------------+ 510 // | CHANNEL_NAME | MEMBER_ID | MEMBER_HOST | MEMBER_PORT | MEMBER_STATE | MEMBER_ROLE | 511 // +---------------------------+-----------+-------------+-------------+--------------+-------------+ 512 // | group_replication_applier | | | NULL | OFFLINE | | 513 // +---------------------------+-----------+-------------+-------------+--------------+-------------+ 514 // therefore we substitute with view's local hostname and port 515 func (view *GroupView) CreateInstanceKey(member *GroupMember) inst.InstanceKey { 516 if member.HostName == "" && member.Port == 0 { 517 return inst.InstanceKey{ 518 Hostname: view.MySQLHost, 519 Port: view.MySQLPort, 520 } 521 } 522 return inst.InstanceKey{ 523 Hostname: member.HostName, 524 Port: member.Port, 525 } 526 } 527 528 // ToString make string for group view 529 func (view *GroupView) ToString() string { 530 var sb strings.Builder 531 sb.WriteString(fmt.Sprintf("group_name:%v\n", view.GroupName)) 532 for _, m := range view.UnresolvedMembers { 533 sb.WriteString(fmt.Sprintf("host:%v:%v | role:%v | state:%v\n", m.HostName, m.Port, m.Role, m.State)) 534 } 535 return sb.String() 536 } 537 538 func (state MemberState) String() string { 539 switch state { 540 case ONLINE: 541 return inst.GroupReplicationMemberStateOnline 542 case ERROR: 543 return inst.GroupReplicationMemberStateError 544 case RECOVERING: 545 return inst.GroupReplicationMemberStateRecovering 546 case OFFLINE: 547 return inst.GroupReplicationMemberStateOffline 548 case UNREACHABLE: 549 return inst.GroupReplicationMemberStateUnreachable 550 } 551 return "UNKNOWN" 552 } 553 554 func toMemberState(state string) MemberState { 555 switch state { 556 case inst.GroupReplicationMemberStateOnline: 557 return ONLINE 558 case inst.GroupReplicationMemberStateError: 559 return ERROR 560 case inst.GroupReplicationMemberStateRecovering: 561 return RECOVERING 562 case inst.GroupReplicationMemberStateOffline: 563 return OFFLINE 564 case inst.GroupReplicationMemberStateUnreachable: 565 return UNREACHABLE 566 default: 567 return UNKNOWNSTATE 568 } 569 } 570 571 func (role MemberRole) String() string { 572 switch role { 573 case PRIMARY: 574 return inst.GroupReplicationMemberRolePrimary 575 case SECONDARY: 576 return inst.GroupReplicationMemberRoleSecondary 577 } 578 return "UNKNOWN" 579 } 580 581 func toMemberRole(role string) MemberRole { 582 switch role { 583 case inst.GroupReplicationMemberRolePrimary: 584 return PRIMARY 585 case inst.GroupReplicationMemberRoleSecondary: 586 return SECONDARY 587 default: 588 return UNKNOWNROLE 589 } 590 }