vitess.io/vitess@v0.16.2/go/vt/vtorc/logic/topology_recovery.go (about) 1 /* 2 Copyright 2015 Shlomi Noach, courtesy Booking.com 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package logic 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "math/rand" 24 "strings" 25 "time" 26 27 "github.com/patrickmn/go-cache" 28 29 logutilpb "vitess.io/vitess/go/vt/proto/logutil" 30 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 31 32 "vitess.io/vitess/go/stats" 33 "vitess.io/vitess/go/vt/log" 34 "vitess.io/vitess/go/vt/logutil" 35 "vitess.io/vitess/go/vt/topo/topoproto" 36 "vitess.io/vitess/go/vt/vtctl/reparentutil" 37 "vitess.io/vitess/go/vt/vtorc/config" 38 "vitess.io/vitess/go/vt/vtorc/inst" 39 "vitess.io/vitess/go/vt/vtorc/util" 40 "vitess.io/vitess/go/vt/vttablet/tmclient" 41 ) 42 43 type RecoveryType string 44 45 const ( 46 PrimaryRecovery RecoveryType = "PrimaryRecovery" 47 CoPrimaryRecovery RecoveryType = "CoPrimaryRecovery" 48 IntermediatePrimaryRecovery RecoveryType = "IntermediatePrimaryRecovery" 49 50 CheckAndRecoverGenericProblemRecoveryName string = "CheckAndRecoverGenericProblem" 51 RecoverDeadPrimaryRecoveryName string = "RecoverDeadPrimary" 52 RecoverPrimaryHasPrimaryRecoveryName string = "RecoverPrimaryHasPrimary" 53 CheckAndRecoverLockedSemiSyncPrimaryRecoveryName string = "CheckAndRecoverLockedSemiSyncPrimary" 54 ElectNewPrimaryRecoveryName string = "ElectNewPrimary" 55 FixPrimaryRecoveryName string = "FixPrimary" 56 FixReplicaRecoveryName string = "FixReplica" 57 ) 58 59 var ( 60 actionableRecoveriesNames = []string{ 61 RecoverDeadPrimaryRecoveryName, 62 RecoverPrimaryHasPrimaryRecoveryName, 63 ElectNewPrimaryRecoveryName, 64 FixPrimaryRecoveryName, 65 FixReplicaRecoveryName, 66 } 67 68 countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") 69 70 // recoveriesCounter counts the number of recoveries that VTOrc has performed 71 recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) 72 73 // recoveriesSuccessfulCounter counts the number of successful recoveries that VTOrc has performed 74 recoveriesSuccessfulCounter = stats.NewCountersWithSingleLabel("SuccessfulRecoveries", "Count of the different successful recoveries performed", "RecoveryType", actionableRecoveriesNames...) 75 76 // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed 77 recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) 78 ) 79 80 // recoveryFunction is the code of the recovery function to be used 81 // this is returned from getCheckAndRecoverFunctionCode to compare the functions returned 82 // Each recoveryFunction is one to one mapped to a corresponding recovery. 
83 type recoveryFunction int 84 85 const ( 86 noRecoveryFunc recoveryFunction = iota 87 recoverGenericProblemFunc 88 recoverDeadPrimaryFunc 89 recoverPrimaryHasPrimaryFunc 90 recoverLockedSemiSyncPrimaryFunc 91 electNewPrimaryFunc 92 fixPrimaryFunc 93 fixReplicaFunc 94 ) 95 96 type RecoveryAcknowledgement struct { 97 CreatedAt time.Time 98 Owner string 99 Comment string 100 101 Key inst.InstanceKey 102 ID int64 103 UID string 104 AllRecoveries bool 105 } 106 107 // BlockedTopologyRecovery represents an entry in the blocked_topology_recovery table 108 type BlockedTopologyRecovery struct { 109 FailedInstanceKey inst.InstanceKey 110 Analysis inst.AnalysisCode 111 LastBlockedTimestamp string 112 BlockingRecoveryID int64 113 } 114 115 // TopologyRecovery represents an entry in the topology_recovery table 116 type TopologyRecovery struct { 117 inst.PostponedFunctionsContainer 118 119 ID int64 120 UID string 121 AnalysisEntry inst.ReplicationAnalysis 122 SuccessorKey *inst.InstanceKey 123 SuccessorAlias string 124 IsActive bool 125 IsSuccessful bool 126 LostReplicas inst.InstanceKeyMap 127 ParticipatingInstanceKeys inst.InstanceKeyMap 128 AllErrors []string 129 RecoveryStartTimestamp string 130 RecoveryEndTimestamp string 131 ProcessingNodeHostname string 132 ProcessingNodeToken string 133 Acknowledged bool 134 AcknowledgedAt string 135 AcknowledgedBy string 136 AcknowledgedComment string 137 LastDetectionID int64 138 RelatedRecoveryID int64 139 Type RecoveryType 140 RecoveryType PrimaryRecoveryType 141 } 142 143 func NewTopologyRecovery(replicationAnalysis inst.ReplicationAnalysis) *TopologyRecovery { 144 topologyRecovery := &TopologyRecovery{} 145 topologyRecovery.UID = util.PrettyUniqueToken() 146 topologyRecovery.AnalysisEntry = replicationAnalysis 147 topologyRecovery.SuccessorKey = nil 148 topologyRecovery.LostReplicas = *inst.NewInstanceKeyMap() 149 topologyRecovery.ParticipatingInstanceKeys = *inst.NewInstanceKeyMap() 150 topologyRecovery.AllErrors = []string{} 151 topologyRecovery.RecoveryType = NotPrimaryRecovery 152 return topologyRecovery 153 } 154 155 func (topologyRecovery *TopologyRecovery) AddError(err error) error { 156 if err != nil { 157 topologyRecovery.AllErrors = append(topologyRecovery.AllErrors, err.Error()) 158 } 159 return err 160 } 161 162 func (topologyRecovery *TopologyRecovery) AddErrors(errs []error) { 163 for _, err := range errs { 164 _ = topologyRecovery.AddError(err) 165 } 166 } 167 168 type TopologyRecoveryStep struct { 169 ID int64 170 RecoveryUID string 171 AuditAt string 172 Message string 173 } 174 175 func NewTopologyRecoveryStep(uid string, message string) *TopologyRecoveryStep { 176 return &TopologyRecoveryStep{ 177 RecoveryUID: uid, 178 Message: message, 179 } 180 } 181 182 type PrimaryRecoveryType string 183 184 const ( 185 NotPrimaryRecovery PrimaryRecoveryType = "NotPrimaryRecovery" 186 PrimaryRecoveryGTID PrimaryRecoveryType = "PrimaryRecoveryGTID" 187 PrimaryRecoveryBinlogServer PrimaryRecoveryType = "PrimaryRecoveryBinlogServer" 188 PrimaryRecoveryUnknown PrimaryRecoveryType = "PrimaryRecoveryUnknown" 189 ) 190 191 var emergencyReadTopologyInstanceMap *cache.Cache 192 var emergencyRestartReplicaTopologyInstanceMap *cache.Cache 193 var emergencyOperationGracefulPeriodMap *cache.Cache 194 195 func init() { 196 go initializeTopologyRecoveryPostConfiguration() 197 } 198 199 func initializeTopologyRecoveryPostConfiguration() { 200 config.WaitForConfigurationToBeLoaded() 201 202 emergencyReadTopologyInstanceMap = cache.New(time.Second, 
time.Millisecond*250) 203 emergencyRestartReplicaTopologyInstanceMap = cache.New(time.Second*30, time.Second) 204 emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500) 205 } 206 207 // AuditTopologyRecovery audits a single step in a topology recovery process. 208 func AuditTopologyRecovery(topologyRecovery *TopologyRecovery, message string) error { 209 log.Infof("topology_recovery: %s", message) 210 if topologyRecovery == nil { 211 return nil 212 } 213 214 recoveryStep := NewTopologyRecoveryStep(topologyRecovery.UID, message) 215 return writeTopologyRecoveryStep(recoveryStep) 216 } 217 218 func resolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst.Instance) error { 219 if successorInstance != nil { 220 topologyRecovery.SuccessorKey = &successorInstance.Key 221 topologyRecovery.SuccessorAlias = successorInstance.InstanceAlias 222 topologyRecovery.IsSuccessful = true 223 } 224 return writeResolveRecovery(topologyRecovery) 225 } 226 227 // recoverPrimaryHasPrimary resets the replication on the primary instance 228 func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { 229 topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) 230 if topologyRecovery == nil { 231 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceKey)) 232 return false, nil, err 233 } 234 log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) 235 // This has to be done in the end; whether successful or not, we should mark that the recovery is done. 236 // So that after the active period passes, we are able to run other recoveries. 237 defer func() { 238 _ = resolveRecovery(topologyRecovery, nil) 239 }() 240 241 // Reset replication on current primary. 242 err = inst.ResetReplicationParameters(analysisEntry.AnalyzedInstanceKey) 243 if err != nil { 244 return false, topologyRecovery, err 245 } 246 return true, topologyRecovery, nil 247 } 248 249 // recoverDeadPrimary checks a given analysis, decides whether to take action, and possibly takes action 250 // Returns true when action was taken. 
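// The actual failover work is delegated to Vitess' EmergencyReparentShard (ERS) via reparentutil;
// whichever replica ERS promotes, if any, is recorded as the successor of this recovery.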
251 func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { 252 if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery) { 253 return false, nil, nil 254 } 255 256 // Read the tablet information from the database to find the shard and keyspace of the tablet 257 tablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey) 258 if err != nil { 259 return false, nil, err 260 } 261 262 var candidateTabletAlias *topodatapb.TabletAlias 263 if candidateInstanceKey != nil { 264 candidateTablet, err := inst.ReadTablet(*candidateInstanceKey) 265 if err != nil { 266 return false, nil, err 267 } 268 candidateTabletAlias = candidateTablet.Alias 269 } 270 topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) 271 if topologyRecovery == nil { 272 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverDeadPrimary.", analysisEntry.AnalyzedInstanceKey)) 273 return false, nil, err 274 } 275 log.Infof("Analysis: %v, deadprimary %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) 276 var promotedReplica *inst.Instance 277 // This has to be done in the end; whether successful or not, we should mark that the recovery is done. 278 // So that after the active period passes, we are able to run other recoveries. 279 defer func() { 280 _ = resolveRecovery(topologyRecovery, promotedReplica) 281 }() 282 283 ev, err := reparentutil.NewEmergencyReparenter(ts, tmclient.NewTabletManagerClient(), logutil.NewCallbackLogger(func(event *logutilpb.Event) { 284 level := event.GetLevel() 285 value := event.GetValue() 286 // we only log the warnings and errors explicitly, everything gets logged as an information message anyways in auditing topology recovery 287 switch level { 288 case logutilpb.Level_WARNING: 289 log.Warningf("ERS - %s", value) 290 case logutilpb.Level_ERROR: 291 log.Errorf("ERS - %s", value) 292 default: 293 log.Infof("ERS - %s", value) 294 } 295 _ = AuditTopologyRecovery(topologyRecovery, value) 296 })).ReparentShard(ctx, 297 tablet.Keyspace, 298 tablet.Shard, 299 reparentutil.EmergencyReparentOptions{ 300 NewPrimaryAlias: candidateTabletAlias, 301 IgnoreReplicas: nil, 302 WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, 303 PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover, 304 }, 305 ) 306 if err != nil { 307 log.Errorf("Error running ERS - %v", err) 308 } 309 310 if ev != nil && ev.NewPrimary != nil { 311 promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{ 312 Hostname: ev.NewPrimary.MysqlHostname, 313 Port: int(ev.NewPrimary.MysqlPort), 314 }) 315 } 316 postErsCompletion(topologyRecovery, analysisEntry, skipProcesses, promotedReplica) 317 return true, topologyRecovery, err 318 } 319 320 func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, skipProcesses bool, promotedReplica *inst.Instance) { 321 if promotedReplica != nil { 322 message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key) 323 _ = AuditTopologyRecovery(topologyRecovery, message) 324 _ = inst.AuditOperation("recover-dead-primary", &analysisEntry.AnalyzedInstanceKey, message) 325 } 326 // Now, see whether we are successful or not. 
From this point there's no going back.
327 if promotedReplica != nil {
328 // Success!
329 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadPrimary: successfully promoted %+v", promotedReplica.Key))
330 }
331 }
332
333 // checkAndRecoverLockedSemiSyncPrimary is a no-op recovery function for the LockedSemiSyncPrimary analysis; it currently takes no action.
334 func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
335 return false, nil, nil
336 }
337
338 // checkAndRecoverGenericProblem is a general-purpose recovery function
339 func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
340 return false, nil, nil
341 }
342
343 // Force a re-read of a topology instance; this is done because we need to substantiate a suspicion
344 // that we may have a failover scenario. We want to speed up reading the complete picture.
345 func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) (instance *inst.Instance) {
346 if existsInCacheError := emergencyReadTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil {
347 // Just recently attempted
348 return nil
349 }
350 instance, _ = inst.ReadTopologyInstance(instanceKey)
351 _ = inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode))
352 return instance
353 }
354
355 // Force reading of replicas of given instance. This is because we suspect the instance is dead, and want to speed up
356 // detection of replication failure from its replicas.
357 func emergentlyReadTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) {
358 replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey)
359 if err != nil {
360 return
361 }
362 for _, replica := range replicas {
363 go emergentlyReadTopologyInstance(&replica.Key, analysisCode)
364 }
365 }
366
367 // emergentlyRestartReplicationOnTopologyInstance forces a RestartReplication on a given instance.
368 func emergentlyRestartReplicationOnTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) {
369 if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil {
370 // Just recently attempted on this specific replica
371 return
372 }
373 go inst.ExecuteOnTopology(func() {
374 _ = restartReplication(instanceKey)
375 _ = inst.AuditOperation("emergently-restart-replication-topology-instance", instanceKey, string(analysisCode))
376 })
377 }
378
379 func beginEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) {
380 emergencyOperationGracefulPeriodMap.Set(instanceKey.StringCode(), true, cache.DefaultExpiration)
381 }
382
383 func isInEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) bool {
384 _, found := emergencyOperationGracefulPeriodMap.Get(instanceKey.StringCode())
385 return found
386 }
387
388 // emergentlyRestartReplicationOnTopologyInstanceReplicas forces a stop/start of replication on
389 // replicas of a given instance, in an attempt to cause them to re-evaluate their replication state.
390 // This can be useful in scenarios where the primary has Too Many Connections, but long-time connected 391 // replicas are not seeing this; when they stop+start replication, they need to re-authenticate and 392 // that's where we hope they realize the primary is bad. 393 func emergentlyRestartReplicationOnTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { 394 if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { 395 // While each replica's RestartReplication() is throttled on its own, it's also wasteful to 396 // iterate all replicas all the time. This is the reason why we do grand-throttle check. 397 return 398 } 399 beginEmergencyOperationGracefulPeriod(instanceKey) 400 401 replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) 402 if err != nil { 403 return 404 } 405 for _, replica := range replicas { 406 replicaKey := &replica.Key 407 go emergentlyRestartReplicationOnTopologyInstance(replicaKey, analysisCode) 408 } 409 } 410 411 func emergentlyRecordStaleBinlogCoordinates(instanceKey *inst.InstanceKey, binlogCoordinates *inst.BinlogCoordinates) { 412 err := inst.RecordStaleInstanceBinlogCoordinates(instanceKey, binlogCoordinates) 413 if err != nil { 414 log.Error(err) 415 } 416 } 417 418 // checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes 419 // failure-detection processes. 420 func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) { 421 if ok, _ := AttemptFailureDetectionRegistration(&analysisEntry); !ok { 422 if util.ClearToLog("checkAndExecuteFailureDetectionProcesses", analysisEntry.AnalyzedInstanceKey.StringCode()) { 423 log.Infof("checkAndExecuteFailureDetectionProcesses: could not register %+v detection on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) 424 } 425 return false, false, nil 426 } 427 log.Infof("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) 428 return true, false, nil 429 } 430 431 // getCheckAndRecoverFunctionCode gets the recovery function code to use for the given analysis. 
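// While the analyzed instance is inside an emergency-operation graceful period (see
// beginEmergencyOperationGracefulPeriod), dead-primary and locked-semi-sync analyses are
// deliberately downgraded to recoverGenericProblemFunc rather than acted upon immediately,
// giving the in-flight emergency operations time to take effect.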
432 func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) recoveryFunction { 433 switch analysisCode { 434 // primary 435 case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas: 436 if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { 437 return recoverGenericProblemFunc 438 } 439 return recoverDeadPrimaryFunc 440 case inst.PrimaryHasPrimary: 441 return recoverPrimaryHasPrimaryFunc 442 case inst.LockedSemiSyncPrimary: 443 if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { 444 return recoverGenericProblemFunc 445 } 446 return recoverLockedSemiSyncPrimaryFunc 447 case inst.ClusterHasNoPrimary: 448 return electNewPrimaryFunc 449 case inst.PrimaryIsReadOnly, inst.PrimarySemiSyncMustBeSet, inst.PrimarySemiSyncMustNotBeSet: 450 return fixPrimaryFunc 451 // replica 452 case inst.NotConnectedToPrimary, inst.ConnectedToWrongPrimary, inst.ReplicationStopped, inst.ReplicaIsWritable, 453 inst.ReplicaSemiSyncMustBeSet, inst.ReplicaSemiSyncMustNotBeSet: 454 return fixReplicaFunc 455 // primary, non actionable 456 case inst.DeadPrimaryAndReplicas: 457 return recoverGenericProblemFunc 458 case inst.UnreachablePrimary: 459 return recoverGenericProblemFunc 460 case inst.UnreachablePrimaryWithLaggingReplicas: 461 return recoverGenericProblemFunc 462 case inst.AllPrimaryReplicasNotReplicating: 463 return recoverGenericProblemFunc 464 case inst.AllPrimaryReplicasNotReplicatingOrDead: 465 return recoverGenericProblemFunc 466 } 467 // Right now this is mostly causing noise with no clear action. 468 // Will revisit this in the future. 469 // case inst.AllPrimaryReplicasStale: 470 // return recoverGenericProblemFunc 471 472 return noRecoveryFunc 473 } 474 475 // hasActionableRecovery tells if a recoveryFunction has an actionable recovery or not 476 func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool { 477 switch recoveryFunctionCode { 478 case noRecoveryFunc: 479 return false 480 case recoverGenericProblemFunc: 481 return false 482 case recoverDeadPrimaryFunc: 483 return true 484 case recoverPrimaryHasPrimaryFunc: 485 return true 486 case recoverLockedSemiSyncPrimaryFunc: 487 return true 488 case electNewPrimaryFunc: 489 return true 490 case fixPrimaryFunc: 491 return true 492 case fixReplicaFunc: 493 return true 494 default: 495 return false 496 } 497 } 498 499 // getCheckAndRecoverFunction gets the recovery function for the given code. 500 func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) ( 501 checkAndRecoverFunction func(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), 502 ) { 503 switch recoveryFunctionCode { 504 case noRecoveryFunc: 505 return nil 506 case recoverGenericProblemFunc: 507 return checkAndRecoverGenericProblem 508 case recoverDeadPrimaryFunc: 509 return recoverDeadPrimary 510 case recoverPrimaryHasPrimaryFunc: 511 return recoverPrimaryHasPrimary 512 case recoverLockedSemiSyncPrimaryFunc: 513 return checkAndRecoverLockedSemiSyncPrimary 514 case electNewPrimaryFunc: 515 return electNewPrimary 516 case fixPrimaryFunc: 517 return fixPrimary 518 case fixReplicaFunc: 519 return fixReplica 520 default: 521 return nil 522 } 523 } 524 525 // getRecoverFunctionName gets the recovery function name for the given code. 
526 // This name is used for metrics 527 func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string { 528 switch recoveryFunctionCode { 529 case noRecoveryFunc: 530 return "" 531 case recoverGenericProblemFunc: 532 return CheckAndRecoverGenericProblemRecoveryName 533 case recoverDeadPrimaryFunc: 534 return RecoverDeadPrimaryRecoveryName 535 case recoverPrimaryHasPrimaryFunc: 536 return RecoverPrimaryHasPrimaryRecoveryName 537 case recoverLockedSemiSyncPrimaryFunc: 538 return CheckAndRecoverLockedSemiSyncPrimaryRecoveryName 539 case electNewPrimaryFunc: 540 return ElectNewPrimaryRecoveryName 541 case fixPrimaryFunc: 542 return FixPrimaryRecoveryName 543 case fixReplicaFunc: 544 return FixReplicaRecoveryName 545 default: 546 return "" 547 } 548 } 549 550 // isClusterWideRecovery returns whether the given recovery is a cluster-wide recovery or not 551 func isClusterWideRecovery(recoveryFunctionCode recoveryFunction) bool { 552 switch recoveryFunctionCode { 553 case recoverDeadPrimaryFunc, electNewPrimaryFunc: 554 return true 555 default: 556 return false 557 } 558 } 559 560 // analysisEntriesHaveSameRecovery tells whether the two analysis entries have the same recovery function or not 561 func analysisEntriesHaveSameRecovery(prevAnalysis, newAnalysis inst.ReplicationAnalysis) bool { 562 prevRecoveryFunctionCode := getCheckAndRecoverFunctionCode(prevAnalysis.Analysis, &prevAnalysis.AnalyzedInstanceKey) 563 newRecoveryFunctionCode := getCheckAndRecoverFunctionCode(newAnalysis.Analysis, &newAnalysis.AnalyzedInstanceKey) 564 return prevRecoveryFunctionCode == newRecoveryFunctionCode 565 } 566 567 func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { 568 switch analysisEntry.Analysis { 569 case inst.DeadPrimaryAndReplicas: 570 go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstancePrimaryKey, analysisEntry.Analysis) 571 case inst.UnreachablePrimary: 572 go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 573 go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 574 case inst.UnreachablePrimaryWithLaggingReplicas: 575 go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 576 case inst.LockedSemiSyncPrimaryHypothesis: 577 go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 578 go emergentlyRecordStaleBinlogCoordinates(&analysisEntry.AnalyzedInstanceKey, &analysisEntry.AnalyzedInstanceBinlogCoordinates) 579 case inst.AllPrimaryReplicasNotReplicating: 580 go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 581 case inst.AllPrimaryReplicasNotReplicatingOrDead: 582 go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) 583 } 584 } 585 586 // executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis. 
587 // It executes the function synchronously.
588 func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
589 countPendingRecoveries.Add(1)
590 defer countPendingRecoveries.Add(-1)
591
592 checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey)
593 isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode)
594 analysisEntry.IsActionableRecovery = isActionableRecovery
595 runEmergentOperations(&analysisEntry)
596
597 if checkAndRecoverFunctionCode == noRecoveryFunc {
598 // Unhandled problem type
599 if analysisEntry.Analysis != inst.NoProblem {
600 if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceKey.StringCode()) {
601 log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v",
602 analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey)
603 }
604 }
605
606 return false, nil, nil
607 }
608 // we have a recovery function; its execution still depends on filters if not disabled.
609 if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceKey.StringCode()) {
610 log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses)
611 }
612
613 // At this point we have validated there's a failure scenario for which we have a recovery path.
614
615 // Initiate detection:
616 _, _, err = checkAndExecuteFailureDetectionProcesses(analysisEntry, skipProcesses)
617 if err != nil {
618 log.Errorf("executeCheckAndRecoverFunction: error on failure detection: %+v", err)
619 return false, nil, err
620 }
621 // We don't mind whether detection really executed the processes or not
622 // (it may have been silenced due to previous detection). We only care that there's no error.
623
624 // We're about to embark on recovery shortly...
625
626 // Check for recovery being disabled globally
627 if recoveryDisabledGlobally, err := IsRecoveryDisabled(); err != nil {
628 // Unexpected. Shouldn't get this
629 log.Errorf("Unable to determine if recovery is disabled globally: %v", err)
630 } else if recoveryDisabledGlobally {
631 if !forceInstanceRecovery {
632 log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+
633 "skipProcesses: %v: NOT Recovering host (disabled globally)",
634 analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses)
635
636 return false, nil, err
637 }
638 log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+
639 "skipProcesses: %v: recoveries disabled globally but forcing this recovery",
640 analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses)
641 }
642
643 // We lock the shard here and then refresh the tablets' information
644 ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey)
645 if err != nil {
646 return false, nil, err
647 }
648 defer unlock(&err)
649
650 // Check if the recovery is already fixed or not. We need this because vtorc works on ephemeral data to find the failure scenarios.
651 // That data might be old, because of a cluster operation that was run through vtctld or some other vtorc. So before we do any 652 // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure 653 // that the data that we use now is up-to-date. 654 if isActionableRecovery { 655 // The first step we have to do is refresh the keyspace information 656 // This is required to know if the durability policies have changed or not 657 // If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore 658 err := RefreshKeyspace(analysisEntry.AnalyzedKeyspace) 659 if err != nil { 660 return false, nil, err 661 } 662 // If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets 663 // of a shard because a new tablet could have been promoted, and we need to have this visibility before we 664 // run a cluster operation of our own. 665 if isClusterWideRecovery(checkAndRecoverFunctionCode) { 666 forceRefreshAllTabletsInShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) 667 } else { 668 // If we are not running a cluster-wide recovery, then it is only concerned with the specific tablet 669 // on which the failure occurred and the primary instance of the shard. 670 // For example, ConnectedToWrongPrimary analysis only cares for whom the current primary tablet is 671 // and the host-port set on the tablet in question. 672 // So, we only need to refresh the tablet info records (to know if the primary tablet has changed), 673 // and the replication data of the new primary and this tablet. 674 refreshTabletInfoOfShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) 675 DiscoverInstance(analysisEntry.AnalyzedInstanceKey, true) 676 primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) 677 if err != nil { 678 log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while finding the shard primary: %v", 679 analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) 680 return false, nil, err 681 } 682 primaryInstanceKey := inst.InstanceKey{ 683 Hostname: primaryTablet.MysqlHostname, 684 Port: int(primaryTablet.MysqlPort), 685 } 686 // We can skip the refresh if we know the tablet we are looking at is the primary tablet. 687 // This would be the case for PrimaryHasPrimary recovery. We don't need to refresh the same tablet twice. 
688 if !analysisEntry.AnalyzedInstanceKey.Equals(&primaryInstanceKey) {
689 DiscoverInstance(primaryInstanceKey, true)
690 }
691 }
692 alreadyFixed, err := checkIfAlreadyFixed(analysisEntry)
693 if err != nil {
694 log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v",
695 analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err)
696 return false, nil, err
697 }
698 if alreadyFixed {
699 log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis)
700 return false, nil, nil
701 }
702 }
703
704 // Actually attempt recovery:
705 if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceKey.StringCode()) {
706 log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses)
707 }
708 recoveryAttempted, topologyRecovery, err = getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses)
709 if !recoveryAttempted {
710 return recoveryAttempted, topologyRecovery, err
711 }
712 recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode)
713 recoveriesCounter.Add(recoveryName, 1)
714 if err != nil {
715 recoveriesFailureCounter.Add(recoveryName, 1)
716 } else {
717 recoveriesSuccessfulCounter.Add(recoveryName, 1)
718 }
719 if topologyRecovery == nil {
720 return recoveryAttempted, topologyRecovery, err
721 }
722 if b, err := json.Marshal(topologyRecovery); err == nil {
723 log.Infof("Topology recovery: %+v", string(b))
724 } else {
725 log.Infof("Topology recovery: %+v", topologyRecovery)
726 }
727 // If we ran a cluster-wide recovery and actually attempted it, then we know that the replication state for all the tablets in this cluster
728 // would have changed. So we can go ahead and pre-emptively refresh them.
729 // For this refresh we don't use the same context that we used for the recovery, since that context might have expired or could expire soon.
730 // Instead we pass the background context. The call forceRefreshAllTabletsInShard handles adding a timeout to it for us.
731 if isClusterWideRecovery(checkAndRecoverFunctionCode) {
732 forceRefreshAllTabletsInShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard)
733 } else {
734 // For all other recoveries, we would have changed the replication status of the analyzed tablet,
735 // so it doesn't hurt to re-read the information of this tablet; otherwise we'll requeue the same recovery
736 // that we just completed because we would be using stale data.
737 DiscoverInstance(analysisEntry.AnalyzedInstanceKey, true)
738 }
739 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Waiting for %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len()))
740 topologyRecovery.Wait()
741 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Executed %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len()))
742 if topologyRecovery.PostponedFunctionsContainer.Len() > 0 {
743 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Executed postponed functions: %+v", strings.Join(topologyRecovery.PostponedFunctionsContainer.Descriptions(), ", ")))
744 }
745 return recoveryAttempted, topologyRecovery, err
746 }
747
748 // checkIfAlreadyFixed checks whether the problem that the analysis entry represents has already been fixed by another agent or not
749 func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) {
750 // Run a replication analysis again. We will check if the problem persisted
751 analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, &inst.ReplicationAnalysisHints{})
752 if err != nil {
753 return false, err
754 }
755
756 for _, entry := range analysisEntries {
757 // If there is an analysis which requires the same recovery, then we should proceed with the recovery
758 if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) && analysisEntriesHaveSameRecovery(analysisEntry, entry) {
759 return false, nil
760 }
761 }
762
763 // We didn't find a replication analysis matching the original failure, which means that some other agent probably fixed it.
764 return true, nil
765 }
766
767 // CheckAndRecover is the main entry point for the recovery mechanism
768 func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) {
769 // Allow the analysis to run even if we don't want to recover
770 replicationAnalysis, err := inst.GetReplicationAnalysis("", "", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true})
771 if err != nil {
772 log.Error(err)
773 return false, nil, err
774 }
775 // intentionally iterating entries in random order
776 for _, j := range rand.Perm(len(replicationAnalysis)) {
777 analysisEntry := replicationAnalysis[j]
778 if specificInstance != nil {
779 // We are looking for a specific instance; if this is not the one, skip!
780 if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) {
781 continue
782 }
783 }
784 if analysisEntry.SkippableDueToDowntime && specificInstance == nil {
785 // Only recover a downtimed server if explicitly requested
786 continue
787 }
788
789 if specificInstance != nil {
790 // force mode. Keep it synchronous.
791 var topologyRecovery *TopologyRecovery
792 recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
793 if err != nil {
794 log.Error(err)
795 }
796 if topologyRecovery != nil {
797 promotedReplicaKey = topologyRecovery.SuccessorKey
798 }
799 } else {
800 go func() {
801 _, _, err := executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses)
802 if err != nil {
803 log.Error(err)
804 }
805 }()
806 }
807 }
808 return recoveryAttempted, promotedReplicaKey, err
809 }
810
811 func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, promotedReplica *inst.Instance) {
812 if promotedReplica != nil {
813 message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key)
814 _ = AuditTopologyRecovery(topologyRecovery, message)
815 _ = inst.AuditOperation(string(analysisEntry.Analysis), &analysisEntry.AnalyzedInstanceKey, message)
816 }
817 // Now, see whether we are successful or not. From this point there's no going back.
818 if promotedReplica != nil {
819 // Success!
820 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.Key))
821 }
822 }
823
824 // electNewPrimary elects a new primary when none was present before.
825 func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
826 topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false /*failIfFailedInstanceInActiveRecovery*/, true /*failIfClusterInActiveRecovery*/)
827 if topologyRecovery == nil || err != nil {
828 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceKey))
829 return false, nil, err
830 }
831 log.Infof("Analysis: %v, will elect a new primary for %v:%v", analysisEntry.Analysis, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard)
832
833 var promotedReplica *inst.Instance
834 // This has to be done in the end; whether successful or not, we should mark that the recovery is done.
835 // So that after the active period passes, we are able to run other recoveries.
836 defer func() {
837 _ = resolveRecovery(topologyRecovery, promotedReplica)
838 }()
839
840 analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey)
841 if err != nil {
842 return false, topologyRecovery, err
843 }
844 _ = AuditTopologyRecovery(topologyRecovery, "starting PlannedReparentShard for electing new primary.")
845
846 ev, err := reparentutil.NewPlannedReparenter(ts, tmclient.NewTabletManagerClient(), logutil.NewCallbackLogger(func(event *logutilpb.Event) {
847 level := event.GetLevel()
848 value := event.GetValue()
849 // we only log the warnings and errors explicitly, everything gets logged as an information message anyways in auditing topology recovery
850 switch level {
851 case logutilpb.Level_WARNING:
852 log.Warningf("PRS - %s", value)
853 case logutilpb.Level_ERROR:
854 log.Errorf("PRS - %s", value)
855 }
856 _ = AuditTopologyRecovery(topologyRecovery, value)
857 })).ReparentShard(ctx,
858 analyzedTablet.Keyspace,
859 analyzedTablet.Shard,
860 reparentutil.PlannedReparentOptions{
861 WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second,
862 },
863 )
864
865 if ev != nil && ev.NewPrimary != nil {
866 promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{
867 Hostname: ev.NewPrimary.MysqlHostname,
868 Port: int(ev.NewPrimary.MysqlPort),
869 })
870 }
871 postPrsCompletion(topologyRecovery, analysisEntry, promotedReplica)
872 return true, topologyRecovery, err
873 }
874
875 // fixPrimary sets the primary as read-write.
876 func fixPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
877 topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true)
878 if topologyRecovery == nil {
879 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceKey))
880 return false, nil, err
881 }
882 log.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey)
883 // This has to be done in the end; whether successful or not, we should mark that the recovery is done.
884 // So that after the active period passes, we are able to run other recoveries.
885 defer func() {
886 _ = resolveRecovery(topologyRecovery, nil)
887 }()
888
889 analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey)
890 if err != nil {
891 return false, topologyRecovery, err
892 }
893
894 durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet)
895 if err != nil {
896 log.Infof("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard)
897 return false, topologyRecovery, err
898 }
899
900 if err := tabletUndoDemotePrimary(ctx, analyzedTablet, inst.SemiSyncAckers(durabilityPolicy, analyzedTablet) > 0); err != nil {
901 return true, topologyRecovery, err
902 }
903 return true, topologyRecovery, nil
904 }
905
906 // fixReplica sets the replica as read-only and points it at the current primary.
907 func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
908 topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true)
909 if topologyRecovery == nil {
910 _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceKey))
911 return false, nil, err
912 }
913 log.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey)
914 // This has to be done in the end; whether successful or not, we should mark that the recovery is done.
915 // So that after the active period passes, we are able to run other recoveries.
916 defer func() {
917 _ = resolveRecovery(topologyRecovery, nil)
918 }()
919
920 analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey)
921 if err != nil {
922 return false, topologyRecovery, err
923 }
924
925 primaryTablet, err := shardPrimary(analyzedTablet.Keyspace, analyzedTablet.Shard)
926 if err != nil {
927 log.Infof("Could not compute primary for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard)
928 return false, topologyRecovery, err
929 }
930
931 durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet)
932 if err != nil {
933 log.Infof("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard)
934 return false, topologyRecovery, err
935 }
936
937 err = setReadOnly(ctx, analyzedTablet)
938 if err != nil {
939 log.Infof("Could not set the tablet %v to readonly - %v", topoproto.TabletAliasString(analyzedTablet.Alias), err)
940 return true, topologyRecovery, err
941 }
942
943 err = setReplicationSource(ctx, analyzedTablet, primaryTablet, inst.IsReplicaSemiSync(durabilityPolicy, primaryTablet, analyzedTablet))
944 return true, topologyRecovery, err
945 }
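
// The helper below is an illustrative sketch and is not part of the original file: it shows how the
// recovery-code helpers above compose for a given analysis entry, mirroring the first steps of
// executeCheckAndRecoverFunction. The name describeRecoveryForAnalysis is hypothetical; the calls it
// makes (getCheckAndRecoverFunctionCode, getRecoverFunctionName, hasActionableRecovery,
// isClusterWideRecovery) are the ones defined in this file.
func describeRecoveryForAnalysis(analysisEntry inst.ReplicationAnalysis) (name string, actionable bool, clusterWide bool) {
	// Map the analysis code (e.g. inst.DeadPrimary) to a recoveryFunction code.
	code := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey)
	// Translate the code into the metric name used by the recovery counters, and report whether the
	// recovery is actionable and whether it is cluster-wide (dead primary / elect new primary).
	return getRecoverFunctionName(code), hasActionableRecovery(code), isClusterWideRecovery(code)
}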