vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/repair.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/spf13/pflag"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/stats"
	"vitess.io/vitess/go/vt/concurrency"
	"vitess.io/vitess/go/vt/servenv"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/vterrors"
	"vitess.io/vitess/go/vt/vtgr/db"

	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

var (
	repairTimingsMs    = stats.NewMultiTimings("repairTimingsMs", "time vtgr takes to repair", []string{"status", "success"})
	unexpectedLockLost = stats.NewCountersWithMultiLabels("unexpectedLockLost", "unexpected loss of the lock", []string{"Keyspace", "Shard"})

	abortRebootstrap bool
)

func init() {
	servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) {
		fs.BoolVar(&abortRebootstrap, "abort_rebootstrap", false, "Don't allow vtgr to rebootstrap an existing group.")
	})
}

// RepairResultCode is the code for repair
type RepairResultCode string

const (
	// Success means successfully repaired
	Success RepairResultCode = "Success"
	// Fail means failed to repair
	Fail RepairResultCode = "Fail"
	// Noop means do nothing
	Noop RepairResultCode = "Noop"
)

// Repair tries to fix the shard based on the diagnose type
func (shard *GRShard) Repair(ctx context.Context, status DiagnoseType) (RepairResultCode, error) {
	shard.Lock()
	defer shard.Unlock()
	var err error
	code := Noop
	switch status {
	case DiagnoseTypeShardHasNoGroup:
		code, err = shard.repairShardHasNoGroup(ctx)
	case DiagnoseTypeShardHasInactiveGroup:
		code, err = shard.repairShardHasInactiveGroup(ctx)
	case DiagnoseTypeWrongPrimaryTablet:
		code, err = shard.repairWrongPrimaryTablet(ctx)
	case DiagnoseTypeUnconnectedReplica:
		code, err = shard.repairUnconnectedReplica(ctx)
	case DiagnoseTypeUnreachablePrimary:
		code, err = shard.repairUnreachablePrimary(ctx)
	case DiagnoseTypeInsufficientGroupSize:
		code, err = shard.repairInsufficientGroupSize(ctx)
	case DiagnoseTypeReadOnlyShard:
		code, err = shard.repairReadOnlyShard(ctx)
	case DiagnoseTypeBootstrapBackoff, DiagnoseTypeBackoffError:
		code, err = shard.repairBackoffError(ctx, status)
	case DiagnoseTypeError:
		shard.logger.Errorf("%v is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
	case DiagnoseTypeHealthy:
		start := time.Now()
		repairTimingsMs.Record([]string{string(status), "true"}, start)
	}
	if status != DiagnoseTypeHealthy {
		shard.logger.Infof("VTGR repaired %v status=%v | code=%v", formatKeyspaceShard(shard.KeyspaceShard), status, code)
	}
	return code, vterrors.Wrap(err, "vtgr repair")
}

func (shard *GRShard) repairShardHasNoGroup(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairShardHasNoGroup")
	if err != nil {
		shard.logger.Warningf("repairShardHasNoGroup fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	// Diagnose() will call shardAgreedGroup as the first thing
	// which will update mysqlGroup stored in the shard
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeShardHasNoGroup {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasNoGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	start := time.Now()
	err = shard.repairShardHasNoGroupAction(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeShardHasNoGroup, strconv.FormatBool(err == nil)}, start)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

func (shard *GRShard) repairShardHasNoGroupAction(ctx context.Context) error {
	// If the group is not empty AND there is at least one active group member
	// we don't need to bootstrap. Instead we should try to join the group
	mysqlGroup := shard.shardAgreedGroupName()
	isAllOffline := shard.isAllOfflineOrError()
	if mysqlGroup != "" {
		shard.logger.Infof("Shard %v already has a group %v", formatKeyspaceShard(shard.KeyspaceShard), mysqlGroup)
		return nil
	}
	// This should not really happen in reality
	if mysqlGroup == "" && !isAllOffline {
		return fmt.Errorf("shard %v has empty group name but some node is not OFFLINE", formatKeyspaceShard(shard.KeyspaceShard))
	}

	// Now we know the group is empty and there is no active node,
	// so we should bootstrap the group
	replicas := shard.instances
	// Sanity check to make sure there is at least one instance
	if len(replicas) == 0 {
		shard.logger.Warningf("Cannot find any instance for the shard %v", formatKeyspaceShard(shard.KeyspaceShard))
		return nil
	}
	if !shard.sqlGroup.IsSafeToBootstrap() {
		return errors.New("unsafe to bootstrap group")
	}
	var candidate *grInstance
	sort.SliceStable(replicas, func(i, j int) bool {
		return replicas[i].alias < replicas[j].alias
	})
	for _, replica := range replicas {
		if !shard.shardStatusCollector.isUnreachable(replica) {
			candidate = replica
			break
		}
	}
	if candidate == nil {
		return errors.New("failed to find any candidate to bootstrap")
	}
	// Bootstrap the group
	shard.logger.Infof("Bootstrapping the group for %v on host=%v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname)
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	if err := shard.dbAgent.BootstrapGroupLocked(candidate.instanceKey); err != nil {
		// if bootstrap failed, the next one that gets the lock will try to do it again
		shard.logger.Errorf("Failed to bootstrap mysql group on %v: %v", candidate.instanceKey.Hostname, err)
		return err
	}
	shard.logger.Infof("Bootstrapped the group for %v", formatKeyspaceShard(shard.KeyspaceShard))
	return nil
}
func (shard *GRShard) repairShardHasInactiveGroup(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairShardHasInactiveGroup")
	if err != nil {
		shard.logger.Warningf("repairShardHasInactiveGroup fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	// Diagnose() will call shardAgreedGroup as the first thing
	// which will update mysqlGroup stored in the shard
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeShardHasInactiveGroup {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasInactiveGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	// Now we know the shard has an agreed group but no member in it
	// We should find the one with the largest GTID set and use it as the
	// new mysql primary to bootstrap the group
	start := time.Now()
	err = shard.stopAndRebootstrap(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeShardHasInactiveGroup, strconv.FormatBool(err == nil)}, start)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

func (shard *GRShard) repairBackoffError(ctx context.Context, diagnose DiagnoseType) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairBackoffError")
	if err != nil {
		shard.logger.Warningf("repairBackoffError fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != diagnose {
		shard.logger.Infof("Shard %v is no longer in %v: %v", formatKeyspaceShard(shard.KeyspaceShard), diagnose, status)
		return Noop, nil
	}
	if shard.lastDiagnoseResult != diagnose {
		shard.logger.Infof("diagnose shard as %v but last diagnose result was %v", diagnose, shard.lastDiagnoseResult)
		return Noop, nil
	}
	now := time.Now()
	var waitTime time.Duration
	switch diagnose {
	case DiagnoseTypeBackoffError:
		waitTime = shard.transientErrorWaitTime
	case DiagnoseTypeBootstrapBackoff:
		waitTime = shard.bootstrapWaitTime
	default:
		return Fail, fmt.Errorf("unsupported diagnose for repairBackoffError: %v", diagnose)
	}
	if now.Sub(shard.lastDiagnoseSince) < waitTime {
		shard.logger.Infof("Detected %v at %v. Still in wait time for a possible network partition", diagnose, shard.lastDiagnoseSince)
		return Noop, nil
	}
	shard.logger.Infof("Detected %v at %v. Start repairing after %v", diagnose, shard.lastDiagnoseSince, waitTime)
	err = shard.stopAndRebootstrap(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeBackoffError, strconv.FormatBool(err == nil)}, now)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}
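// The backoff handling above only rebootstraps once the same diagnosis has persisted
// longer than the configured wait window (transientErrorWaitTime or bootstrapWaitTime).
// A minimal, self-contained sketch of that time check; it is illustrative only and not
// part of the original file.
func exampleBackoffWindowElapsed(lastDiagnoseSince time.Time, waitTime time.Duration) bool {
	// Equivalent to the `now.Sub(shard.lastDiagnoseSince) < waitTime` guard above:
	// returns false (keep waiting) while we are still inside the window.
	return time.Since(lastDiagnoseSince) >= waitTime
}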
func (shard *GRShard) stopAndRebootstrap(ctx context.Context) error {
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	// Before bootstrapping the group, we need to stop the group first
	// abort aggressively here as soon as we encounter an error
	// StopGroupLocked will check if the instance is NOT in "ONLINE"/"RECOVERING" state (i.e., UNREACHABLE, ERROR or OFFLINE)
	errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
		defer wg.Done()
		status := shard.sqlGroup.GetStatus(instance.instanceKey)
		if status != nil && status.State == db.OFFLINE {
			shard.logger.Infof("stop group replication on %v skipped because it is already OFFLINE", instance.alias)
			return
		}
		shard.logger.Infof("stop group replication on %v", instance.alias)
		err := shard.dbAgent.StopGroupLocked(instance.instanceKey)
		if err != nil {
			if !unreachableError(err) {
				er.RecordError(err)
			}
			shard.logger.Warningf("Error during stop group replication on %v: %v", instance.instanceKey.Hostname, err)
		}
	})
	// We don't check allowPartialUnhealthyNodes here because we don't record unreachableError here,
	// hence if errorRecorder has errors, it indicates the mysqld is still reachable but something
	// else went wrong.
	if errorRecorder.HasErrors() {
		shard.logger.Errorf("Failed to stop group replication %v", errorRecorder.Error())
		return errorRecorder.Error()
	}
	shard.logger.Infof("Stopped the group for %v", formatKeyspaceShard(shard.KeyspaceShard))
	shard.logger.Info("Start finding a candidate to rebootstrap")
	candidate, err := shard.findRebootstrapCandidate(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to find rebootstrap candidate: %v", err)
		return err
	}
	shard.refreshSQLGroup()
	if !shard.sqlGroup.IsSafeToRebootstrap() {
		return errors.New("unsafe to rebootstrap group")
	}
	if abortRebootstrap {
		shard.logger.Warningf("Abort stopAndRebootstrap because rebootstrap is disabled by the abort_rebootstrap flag")
		return errForceAbortBootstrap
	}
	shard.logger.Infof("Rebootstrap %v on %v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname)
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	uuid := shard.sqlGroup.GetGroupName()
	if uuid == "" {
		return errors.New("trying to rebootstrap without uuid")
	}
	return shard.dbAgent.RebootstrapGroupLocked(candidate.instanceKey, uuid)
}

// allowPartialUnhealthyNodes returns true if rebootstrapSize is set to a non-zero value
// and the number of errors we got is no more than (number of instances - rebootstrapSize)
func (shard *GRShard) allowPartialUnhealthyNodes(errorRecorder *concurrency.AllErrorRecorder) bool {
	if shard.sqlGroup.rebootstrapSize != 0 && len(shard.instances)-shard.sqlGroup.rebootstrapSize >= len(errorRecorder.GetErrors()) {
		shard.logger.Warningf("Allow unhealthy nodes during the reboot group_size=%v, rebootstrap_config=%v, error=%v", shard.sqlGroup.expectedBootstrapSize, shard.sqlGroup.rebootstrapSize, len(errorRecorder.GetErrors()))
		return true
	}
	return false
}
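// The tolerance check above means: with rebootstrapSize configured, up to
// (len(instances) - rebootstrapSize) instances may fail to report GTIDs and the
// rebootstrap can still proceed. The sketch below works through that arithmetic with
// plain integers; it is illustrative only and not part of the original file.
func exampleRebootstrapErrorBudget(totalInstances, rebootstrapSize, numErrors int) bool {
	if rebootstrapSize == 0 {
		// rebootstrapSize unset: no partial-unhealthy tolerance at all.
		return false
	}
	// e.g. 5 instances with rebootstrapSize=3 tolerates at most 2 failed instances.
	return totalInstances-rebootstrapSize >= numErrors
}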
func (shard *GRShard) getGTIDSetFromAll(skipPrimary bool) (*groupGTIDRecorder, *concurrency.AllErrorRecorder, error) {
	if len(shard.instances) == 0 {
		return nil, nil, fmt.Errorf("%v has 0 instance", formatKeyspaceShard(shard.KeyspaceShard))
	}
	// Before we do failover, we first verify that there is an agreed group name.
	// If not, VTGR is not smart enough to figure out how to failover
	// Note: the caller should make sure the mysqlGroup is refreshed after we grab a shard level lock
	mysqlGroup := shard.shardAgreedGroupName()
	if mysqlGroup == "" {
		return nil, nil, fmt.Errorf("unable to find an agreed group name in %v", formatKeyspaceShard(shard.KeyspaceShard))
	}
	primary := shard.findShardPrimaryTablet()
	var mysqlPrimaryHost string
	var mysqlPrimaryPort int
	// skipPrimary is true when we failover manually or when there is an unreachable primary tablet;
	// in both cases, there should be a reconciled primary tablet
	if skipPrimary && primary != nil {
		status := shard.sqlGroup.GetStatus(primary.instanceKey)
		mysqlPrimaryHost, mysqlPrimaryPort = status.HostName, status.Port
		shard.logger.Infof("Found primary instance from MySQL on %v", mysqlPrimaryHost)
	}
	gtidRecorder := &groupGTIDRecorder{}
	// Iterate through all the instances in the shard and find the one with the largest GTID set with best effort
	// We wrap it with forAllInstances so that the failover can continue if there is a host
	// that is unreachable
	errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
		defer wg.Done()
		if skipPrimary && instance.instanceKey.Hostname == mysqlPrimaryHost && instance.instanceKey.Port == mysqlPrimaryPort {
			shard.logger.Infof("Skip %v to failover to a non-primary node", mysqlPrimaryHost)
			return
		}
		gtids, err := shard.dbAgent.FetchApplierGTIDSet(instance.instanceKey)
		if err != nil {
			er.RecordError(err)
			shard.logger.Errorf("%v got an error while fetching applier GTIDs: %v", instance.alias, err)
			shard.shardStatusCollector.recordProblematics(instance)
			if unreachableError(err) {
				shard.shardStatusCollector.recordUnreachables(instance)
			}
			return
		}
		if gtids == nil {
			shard.logger.Warningf("[failover candidate] skip %s with empty gtid", instance.alias)
			return
		}
		gtidRecorder.recordGroupGTIDs(gtids, instance)
	})
	return gtidRecorder, errorRecorder, nil
}
func (shard *GRShard) findRebootstrapCandidate(ctx context.Context) (*grInstance, error) {
	gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(false)
	if err != nil {
		shard.logger.Errorf("Failed to get gtid from all: %v", err)
		return nil, err
	}
	err = errorRecorder.Error()
	// We cannot tolerate any error from mysql during a rebootstrap.
	if err != nil && !shard.allowPartialUnhealthyNodes(errorRecorder) {
		shard.logger.Errorf("Failed to fetch all GTIDs with forAllInstances for rebootstrap: %v", err)
		return nil, err
	}
	candidate, err := shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, nil)
	if err != nil {
		shard.logger.Errorf("Failed to find rebootstrap candidate by GTID after forAllInstances: %v", err)
		return nil, err
	}
	if candidate == nil {
		return nil, fmt.Errorf("failed to find rebootstrap candidate for %v", formatKeyspaceShard(shard.KeyspaceShard))
	}
	if !shard.instanceReachable(ctx, candidate) {
		shard.logger.Errorf("rebootstrap candidate %v (%v) is not reachable via ping", candidate.alias, candidate.instanceKey.Hostname)
		return nil, fmt.Errorf("%v is unreachable", candidate.alias)
	}
	shard.logger.Infof("%v is the rebootstrap candidate", candidate.alias)
	return candidate, nil
}

// Caller of this function should make sure it holds the shard lock and has the
// latest view of the shard. Otherwise, we might skip the wrong node when we locate the candidate
func (shard *GRShard) findFailoverCandidate(ctx context.Context) (*grInstance, error) {
	gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(true)
	if err != nil {
		shard.logger.Errorf("Failed to get gtid from all: %v", err)
		return nil, err
	}
	err = errorRecorder.Error()
	// During the repair for an unreachable primary we still have a mysql group.
	// Failover within the group is safe, and finding the largest GTID is an optimization,
	// therefore we don't check errors from errorRecorder, we just log them
	if err != nil {
		shard.logger.Warningf("Errors when fetching all GTIDs with forAllInstances for failover: %v", err)
	}
	shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
		defer wg.Done()
		if !shard.instanceReachable(ctx, instance) {
			shard.logger.Errorf("%v is not reachable via ping", instance.alias)
			shard.shardStatusCollector.recordProblematics(instance)
			shard.shardStatusCollector.recordUnreachables(instance)
		}
	})
	var candidate *grInstance
	candidate, err = shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, func(c context.Context, instance *grInstance) bool {
		return !shard.shardStatusCollector.isUnreachable(instance)
	})
	if err != nil {
		shard.logger.Errorf("Failed to find failover candidate by GTID after forAllInstances: %v", err)
		return nil, err
	}
	if candidate == nil {
		return nil, fmt.Errorf("failed to find failover candidate for %v", formatKeyspaceShard(shard.KeyspaceShard))
	}
	shard.logger.Infof("%v is the failover candidate", candidate.alias)
	return candidate, nil
}
func (shard *GRShard) repairWrongPrimaryTablet(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairWrongPrimaryTablet")
	if err != nil {
		shard.logger.Warningf("repairWrongPrimaryTablet fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
		return Noop, err
	}
	defer shard.UnlockShard()
	// We grab the shard level lock and check again if there is no primary
	// to avoid race conditions
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeWrongPrimaryTablet {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeWrongPrimaryTablet: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	start := time.Now()
	err = shard.fixPrimaryTabletLocked(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeWrongPrimaryTablet, strconv.FormatBool(err == nil)}, start)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

// fixPrimaryTabletLocked changes the Vitess primary tablet based on the mysql group
func (shard *GRShard) fixPrimaryTabletLocked(ctx context.Context) error {
	host, port, isActive := shard.sqlGroup.GetPrimary()
	if !isActive {
		return db.ErrGroupInactive
	}
	// The primary tablet does not run the mysql primary, so we need to change it accordingly
	candidate := shard.findTabletByHostAndPort(host, port)
	if candidate == nil {
		return errMissingPrimaryTablet
	}
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	err := shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false)
	if err != nil {
		return fmt.Errorf("failed to change type to primary on %v: %v", candidate.alias, err)
	}
	shard.logger.Infof("Successfully made %v the primary tablet", candidate.alias)
	return nil
}

// repairUnconnectedReplica usually handles the case when there is a DiagnoseTypeHealthy tablet
// that is not connected to the mysql primary node
func (shard *GRShard) repairUnconnectedReplica(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairUnconnectedReplica")
	if err != nil {
		shard.logger.Warningf("repairUnconnectedReplica fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeUnconnectedReplica {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnconnectedReplica: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	start := time.Now()
	err = shard.repairUnconnectedReplicaAction(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeUnconnectedReplica, strconv.FormatBool(err == nil)}, start)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

func (shard *GRShard) repairUnconnectedReplicaAction(ctx context.Context) error {
	primaryInstance := shard.findShardPrimaryTablet()
	target, err := shard.disconnectedInstance()
	if err != nil {
		return err
	}
	if target == nil {
		shard.logger.Infof("there is no instance without group for %v", formatKeyspaceShard(shard.KeyspaceShard))
		return nil
	}
	shard.logger.Infof("Connecting replica %v to %v", target.instanceKey.Hostname, primaryInstance.instanceKey.Hostname)
	status := shard.sqlGroup.GetStatus(target.instanceKey)
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	if status != nil && status.State != db.OFFLINE {
		shard.logger.Infof("stop group replication on %v (%v) before joining the group", target.alias, status.State)
		err := shard.dbAgent.StopGroupLocked(target.instanceKey)
		if err != nil {
			shard.logger.Errorf("Failed to stop group replication on %v: %v", target.instanceKey.Hostname, err)
			return err
		}
		// Make sure we still hold the topo server lock before moving on
		if err := shard.checkShardLocked(ctx); err != nil {
			return err
		}
	}
	return shard.dbAgent.JoinGroupLocked(target.instanceKey, primaryInstance.instanceKey)
}
func (shard *GRShard) repairUnreachablePrimary(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairUnreachablePrimary")
	if err != nil {
		shard.logger.Warningf("repairUnreachablePrimary fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeUnreachablePrimary {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnreachablePrimary: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	// We are here because either:
	// 1. we have a primary tablet, but it's not reachable
	// 2. we cannot find a primary tablet but we do have a mysql group
	// we need to failover mysql manually
	//
	// other cases will be handled by a different diagnose type, e.g.,
	// a reachable primary tablet that runs on a different node than the mysql primary -> DiagnoseTypeWrongPrimaryTablet
	start := time.Now()
	err = shard.failoverLocked(ctx)
	repairTimingsMs.Record([]string{DiagnoseTypeUnreachablePrimary, strconv.FormatBool(err == nil)}, start)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

func (shard *GRShard) repairInsufficientGroupSize(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairInsufficientGroupSize")
	if err != nil {
		shard.logger.Warningf("repairInsufficientGroupSize fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeInsufficientGroupSize {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeInsufficientGroupSize: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	// We check that the primary tablet is consistent with the sql primary before diagnosing InsufficientGroupSize,
	// therefore the primary we found here is correct and healthy
	primary := shard.findShardPrimaryTablet()
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return Fail, err
	}
	// mysql group will set super_read_only properly automatically
	// https://mysqlhighavailability.com/protecting-your-data-fail-safe-enhancements-to-group-replication/
	// since Vitess only knows about one writable node (the primary tablet), if we want to make sure there is no write
	// after the group has insufficient members, we can just set the primary mysql node to be read only
	err = shard.dbAgent.SetReadOnly(primary.instanceKey, true)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

func (shard *GRShard) repairReadOnlyShard(ctx context.Context) (RepairResultCode, error) {
	ctx, err := shard.LockShard(ctx, "repairReadOnlyShard")
	if err != nil {
		shard.logger.Warningf("repairReadOnlyShard fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
		return Noop, err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	status, err := shard.diagnoseLocked(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to diagnose: %v", err)
		return Fail, err
	}
	if status != DiagnoseTypeReadOnlyShard {
		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeReadOnlyShard: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
		return Noop, nil
	}
	primary := shard.findShardPrimaryTablet()
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return Fail, err
	}
	// undo what we did in repairInsufficientGroupSize
	err = shard.dbAgent.SetReadOnly(primary.instanceKey, false)
	if err != nil {
		return Fail, err
	}
	return Success, nil
}

// Failover takes a shard and finds the node with the largest GTID set to use as the new mysql primary of the group
func (shard *GRShard) Failover(ctx context.Context) error {
	ctx, err := shard.LockShard(ctx, "Failover")
	if err != nil {
		shard.logger.Warningf("Failover fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
		return err
	}
	defer shard.UnlockShard()
	shard.refreshTabletsInShardLocked(ctx)
	return shard.failoverLocked(ctx)
}

func (shard *GRShard) failoverLocked(ctx context.Context) error {
	candidate, err := shard.findFailoverCandidate(ctx)
	if err != nil {
		shard.logger.Errorf("Failed to find failover candidate: %v", err)
		return err
	}
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	err = shard.dbAgent.Failover(candidate.instanceKey)
	if err != nil {
		shard.logger.Errorf("Failed to failover mysql to %v", candidate.alias)
		return err
	}
	shard.logger.Infof("Successfully failed over MySQL to %v for %v", candidate.instanceKey.Hostname, formatKeyspaceShard(shard.KeyspaceShard))
	if !shard.isActive.Get() {
		shard.logger.Infof("Skip vttablet failover on an inactive shard %v", formatKeyspaceShard(shard.KeyspaceShard))
		return nil
	}
	// Make sure we still hold the topo server lock before moving on
	if err := shard.checkShardLocked(ctx); err != nil {
		return err
	}
	err = shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false)
	if err != nil {
		shard.logger.Errorf("Failed to failover Vitess %v", candidate.alias)
		return err
	}
	shard.logger.Infof("Successfully failed over Vitess to %v for %v", candidate.alias, formatKeyspaceShard(shard.KeyspaceShard))
	return nil
}
func (shard *GRShard) findFailoverCandidateFromRecorder(ctx context.Context, recorder *groupGTIDRecorder, check func(context.Context, *grInstance) bool) (*grInstance, error) {
	if len(recorder.gtidWithInstances) == 0 {
		return nil, fmt.Errorf("empty failover candidate list for %v", formatKeyspaceShard(shard.KeyspaceShard))
	}
	// Sort the gtidWithInstances slice so that we have a consistent candidate
	// in case they have the same gtid set
	recorder.sort()
	for _, gtidInst := range recorder.gtidWithInstances {
		shard.logger.Infof("[failover candidates] %s gtid %s", gtidInst.instance.alias, gtidInst.gtids.String())
	}
	var largestGTIDs mysql.GTIDSet
	var candidate *grInstance
	var divergentCandidates []string
	// All the instances in the recorder have a reachable mysqld,
	// hence any of them is a valid failover candidate
	for _, elem := range recorder.gtidWithInstances {
		gtids := elem.gtids
		inst := elem.instance
		if check != nil && !check(ctx, inst) {
			shard.logger.Warningf("Skip %v as candidate with gtid %v because it failed the check", inst.alias, gtids.String())
			continue
		}
		if largestGTIDs == nil {
			largestGTIDs = gtids
			candidate = inst
			continue
		}
		// If largestGTIDs is a subset of the current gtids, it means the instance has a larger GTID set than the candidate
		// and we need to swap them out
		isSubset, isSuperset := compareGTIDSet(largestGTIDs, gtids)
		if isSubset {
			largestGTIDs = gtids
			candidate = inst
			continue
		}
		// largestGTIDs is neither a subset nor a superset of gtids;
		// we log and append to divergentCandidates so that we know there is a problem in the group
		// after the iteration
		if !isSuperset {
			shard.logger.Errorf("FetchGroupView divergent GTID set from host=%v GTIDSet=%v", inst.instanceKey.Hostname, gtids)
			divergentCandidates = append(divergentCandidates, inst.alias)
		}
	}
	// unless the GTID sets diverged, divergentCandidates should be empty
	if len(divergentCandidates) > 0 {
		divergentCandidates = append(divergentCandidates, candidate.alias)
		return nil, fmt.Errorf("found more than one failover candidate by GTID set for %v: %v", formatKeyspaceShard(shard.KeyspaceShard), divergentCandidates)
	}
	return candidate, nil
}

func compareGTIDSet(set1, set2 mysql.GTIDSet) (bool, bool) {
	isSubset := set2.Contains(set1)
	// If set1 is a subset of set2 we found a GTID superset and just need to record it
	if isSubset {
		return true, false
	}
	// If set1 is not a subset of set2 we need to see if set1 is actually a superset of set2;
	// this is to detect GTID set divergence
	isSuperset := set1.Contains(set2)
	// We already know set1 is not a subset of set2; if set2 is also not a subset of set1,
	// it means the GTID sets have diverged
	return false, isSuperset
}

func (shard *GRShard) checkShardLocked(ctx context.Context) error {
	if err := topo.CheckShardLocked(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard); err != nil {
		labels := []string{shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard}
		unexpectedLockLost.Add(labels, 1)
		shard.logger.Errorf("lost topology lock; aborting")
		return vterrors.Wrap(err, "lost topology lock; aborting")
	}
	return nil
}
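// Worked example for compareGTIDSet above: a GTID set covering 1-100 of a server UUID
// is a superset of one covering 1-90 of the same UUID, while two sets that each contain
// transactions the other is missing are divergent. This sketch is illustrative only and
// not part of the original file; it assumes mysql.ParseMysql56GTIDSet from go/mysql is
// available to build GTIDSet values, and the UUIDs used are made up.
func exampleCompareGTIDSet() error {
	smaller, err := mysql.ParseMysql56GTIDSet("00000000-0000-0000-0000-000000000001:1-90")
	if err != nil {
		return err
	}
	larger, err := mysql.ParseMysql56GTIDSet("00000000-0000-0000-0000-000000000001:1-100")
	if err != nil {
		return err
	}
	// smaller is contained in larger: compareGTIDSet reports (isSubset=true, isSuperset=false),
	// so the instance holding `larger` would replace the current candidate.
	isSubset, isSuperset := compareGTIDSet(smaller, larger)
	fmt.Printf("subset=%v superset=%v\n", isSubset, isSuperset)

	// Reversed arguments report (false, true): the current candidate already has the larger set.
	isSubset, isSuperset = compareGTIDSet(larger, smaller)
	fmt.Printf("subset=%v superset=%v\n", isSubset, isSuperset)
	return nil
}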