vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/repair.go

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controller
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"strconv"
    25  	"sync"
    26  	"time"
    27  
    28  	"github.com/spf13/pflag"
    29  
    30  	"vitess.io/vitess/go/mysql"
    31  	"vitess.io/vitess/go/stats"
    32  	"vitess.io/vitess/go/vt/concurrency"
    33  	"vitess.io/vitess/go/vt/servenv"
    34  	"vitess.io/vitess/go/vt/topo"
    35  	"vitess.io/vitess/go/vt/vterrors"
    36  	"vitess.io/vitess/go/vt/vtgr/db"
    37  
    38  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    39  )
    40  
    41  var (
    42  	repairTimingsMs    = stats.NewMultiTimings("repairTimingsMs", "time vtgr takes to repair", []string{"status", "success"})
    43  	unexpectedLockLost = stats.NewCountersWithMultiLabels("unexpectedLockLost", "unexpected loss of the lock", []string{"Keyspace", "Shard"})
    44  
    45  	abortRebootstrap bool
    46  )
    47  
    48  func init() {
    49  	servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) {
    50  		fs.BoolVar(&abortRebootstrap, "abort_rebootstrap", false, "Don't allow vtgr to rebootstrap an existing group.")
    51  	})
    52  }
    53  
    54  // RepairResultCode is the result code of a repair operation
    55  type RepairResultCode string
    56  
    57  const (
    58  	// Success means successfully repaired
    59  	Success RepairResultCode = "Success"
    60  	// Fail means failed to repair
    61  	Fail RepairResultCode = "Fail"
    62  	// Noop means do nothing
    63  	Noop RepairResultCode = "Noop"
    64  )
    65  
    66  // Repair tries to fix the shard based on the diagnose type
    67  func (shard *GRShard) Repair(ctx context.Context, status DiagnoseType) (RepairResultCode, error) {
    68  	shard.Lock()
    69  	defer shard.Unlock()
    70  	var err error
    71  	code := Noop
    72  	switch status {
    73  	case DiagnoseTypeShardHasNoGroup:
    74  		code, err = shard.repairShardHasNoGroup(ctx)
    75  	case DiagnoseTypeShardHasInactiveGroup:
    76  		code, err = shard.repairShardHasInactiveGroup(ctx)
    77  	case DiagnoseTypeWrongPrimaryTablet:
    78  		code, err = shard.repairWrongPrimaryTablet(ctx)
    79  	case DiagnoseTypeUnconnectedReplica:
    80  		code, err = shard.repairUnconnectedReplica(ctx)
    81  	case DiagnoseTypeUnreachablePrimary:
    82  		code, err = shard.repairUnreachablePrimary(ctx)
    83  	case DiagnoseTypeInsufficientGroupSize:
    84  		code, err = shard.repairInsufficientGroupSize(ctx)
    85  	case DiagnoseTypeReadOnlyShard:
    86  		code, err = shard.repairReadOnlyShard(ctx)
    87  	case DiagnoseTypeBootstrapBackoff, DiagnoseTypeBackoffError:
    88  		code, err = shard.repairBackoffError(ctx, status)
    89  	case DiagnoseTypeError:
    90  		shard.logger.Errorf("%v is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
    91  	case DiagnoseTypeHealthy:
    92  		start := time.Now()
    93  		repairTimingsMs.Record([]string{string(status), "true"}, start)
    94  	}
    95  	if status != DiagnoseTypeHealthy {
    96  		shard.logger.Infof("VTGR repaired %v status=%v | code=%v", formatKeyspaceShard(shard.KeyspaceShard), status, code)
    97  	}
    98  	return code, vterrors.Wrap(err, "vtgr repair")
    99  }
   100  
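        // repairShardHasNoGroup grabs the shard lock, re-diagnoses under the lock and, if the shard still has
        // no group, bootstraps a new one via repairShardHasNoGroupAction.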
   101  func (shard *GRShard) repairShardHasNoGroup(ctx context.Context) (RepairResultCode, error) {
   102  	ctx, err := shard.LockShard(ctx, "repairShardHasNoGroup")
   103  	if err != nil {
   104  		shard.logger.Warningf("repairShardHasNoGroup fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
   105  		return Noop, err
   106  	}
   107  	defer shard.UnlockShard()
   108  	shard.refreshTabletsInShardLocked(ctx)
   109  	// Diagnose() will call shardAgreedGroupName as the first thing,
   110  	// which will update the mysqlGroup stored in the shard
   111  	status, err := shard.diagnoseLocked(ctx)
   112  	if err != nil {
   113  		shard.logger.Errorf("Failed to diagnose: %v", err)
   114  		return Fail, err
   115  	}
   116  	if status != DiagnoseTypeShardHasNoGroup {
   117  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasNoGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   118  		return Noop, nil
   119  	}
   120  	start := time.Now()
   121  	err = shard.repairShardHasNoGroupAction(ctx)
   122  	repairTimingsMs.Record([]string{DiagnoseTypeShardHasNoGroup, strconv.FormatBool(err == nil)}, start)
   123  	if err != nil {
   124  		return Fail, err
   125  	}
   126  	return Success, nil
   127  }
   128  
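        // repairShardHasNoGroupAction bootstraps a new MySQL group on the reachable instance with the lowest alias,
        // but only if the shard has no agreed group name, all nodes are OFFLINE/ERROR and the group is safe to bootstrap.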
   129  func (shard *GRShard) repairShardHasNoGroupAction(ctx context.Context) error {
   130  	// If group is not empty AND there is at least one active group member
   131  	// we don't need to bootstrap. Instead we should try to join the group
   132  	mysqlGroup := shard.shardAgreedGroupName()
   133  	isAllOffline := shard.isAllOfflineOrError()
   134  	if mysqlGroup != "" {
   135  		shard.logger.Infof("Shard %v already has a group %v", formatKeyspaceShard(shard.KeyspaceShard), mysqlGroup)
   136  		return nil
   137  	}
   138  	// This should not really happen in reality
   139  	if mysqlGroup == "" && !isAllOffline {
   140  		return fmt.Errorf("shard %v has empty group name but some node is not OFFLINE", formatKeyspaceShard(shard.KeyspaceShard))
   141  	}
   142  
   143  	// Now we know the group is empty and there is no active node,
   144  	// so we should bootstrap the group
   145  	replicas := shard.instances
   146  	// Sanity check to make sure there is at least one instance
   147  	if len(replicas) == 0 {
   148  		shard.logger.Warningf("Cannot find any instance for the shard %v", formatKeyspaceShard(shard.KeyspaceShard))
   149  		return nil
   150  	}
   151  	if !shard.sqlGroup.IsSafeToBootstrap() {
   152  		return errors.New("unsafe to bootstrap group")
   153  	}
   154  	var candidate *grInstance
   155  	sort.SliceStable(replicas, func(i, j int) bool {
   156  		return replicas[i].alias < replicas[j].alias
   157  	})
   158  	for _, replica := range replicas {
   159  		if !shard.shardStatusCollector.isUnreachable(replica) {
   160  			candidate = replica
   161  			break
   162  		}
   163  	}
   164  	if candidate == nil {
   165  		return errors.New("failed to find any candidate to bootstrap")
   166  	}
   167  	// Bootstrap the group
   168  	shard.logger.Infof("Bootstrapping the group for %v on host=%v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname)
   169  	// Make sure we still hold the topo server lock before moving on
   170  	if err := shard.checkShardLocked(ctx); err != nil {
   171  		return err
   172  	}
   173  	if err := shard.dbAgent.BootstrapGroupLocked(candidate.instanceKey); err != nil {
   174  		// if bootstrap failed, the next one that gets the lock will try to do it again
   175  		shard.logger.Errorf("Failed to bootstrap mysql group on %v: %v", candidate.instanceKey.Hostname, err)
   176  		return err
   177  	}
   178  	shard.logger.Infof("Bootstrapped the group for %v", formatKeyspaceShard(shard.KeyspaceShard))
   179  	return nil
   180  }
   181  
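        // repairShardHasInactiveGroup grabs the shard lock, re-diagnoses under the lock and, if the group is still
        // inactive, stops group replication everywhere and rebootstraps the group via stopAndRebootstrap.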
   182  func (shard *GRShard) repairShardHasInactiveGroup(ctx context.Context) (RepairResultCode, error) {
   183  	ctx, err := shard.LockShard(ctx, "repairShardHasInactiveGroup")
   184  	if err != nil {
   185  		shard.logger.Warningf("repairShardHasInactiveGroup fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
   186  		return Noop, err
   187  	}
   188  	defer shard.UnlockShard()
   189  	shard.refreshTabletsInShardLocked(ctx)
   190  	// Diagnose() will call shardAgreedGroupName as the first thing,
   191  	// which will update the mysqlGroup stored in the shard
   192  	status, err := shard.diagnoseLocked(ctx)
   193  	if err != nil {
   194  		shard.logger.Errorf("Failed to diagnose: %v", err)
   195  		return Fail, err
   196  	}
   197  	if status != DiagnoseTypeShardHasInactiveGroup {
   198  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasInactiveGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   199  		return Noop, nil
   200  	}
   201  	// Now we know the shard has an agreed group but no active member in it.
   202  	// We should find the instance with the largest GTID set and use it as the
   203  	// new mysql primary to rebootstrap the group
   204  	start := time.Now()
   205  	err = shard.stopAndRebootstrap(ctx)
   206  	repairTimingsMs.Record([]string{DiagnoseTypeShardHasInactiveGroup, strconv.FormatBool(err == nil)}, start)
   207  	if err != nil {
   208  		return Fail, err
   209  	}
   210  	return Success, nil
   211  }
   212  
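        // repairBackoffError handles the backoff diagnoses: it rebootstraps the group only after the same diagnose
        // result has persisted for the configured wait time, so a transient network partition is not repaired prematurely.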
   213  func (shard *GRShard) repairBackoffError(ctx context.Context, diagnose DiagnoseType) (RepairResultCode, error) {
   214  	ctx, err := shard.LockShard(ctx, "repairBackoffError")
   215  	if err != nil {
   216  		shard.logger.Warningf("repairBackoffError fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
   217  		return Noop, err
   218  	}
   219  	defer shard.UnlockShard()
   220  	shard.refreshTabletsInShardLocked(ctx)
   221  	status, err := shard.diagnoseLocked(ctx)
   222  	if err != nil {
   223  		shard.logger.Errorf("Failed to diagnose: %v", err)
   224  		return Fail, err
   225  	}
   226  	if status != diagnose {
   227  		shard.logger.Infof("Shard %v is no longer in %v: %v", formatKeyspaceShard(shard.KeyspaceShard), diagnose, status)
   228  		return Noop, nil
   229  	}
   230  	if shard.lastDiagnoseResult != diagnose {
   231  		shard.logger.Infof("diagnose shard as %v but last diagnose result was %v", diagnose, shard.lastDiagnoseResult)
   232  		return Noop, nil
   233  	}
   234  	now := time.Now()
   235  	var waitTime time.Duration
   236  	switch diagnose {
   237  	case DiagnoseTypeBackoffError:
   238  		waitTime = shard.transientErrorWaitTime
   239  	case DiagnoseTypeBootstrapBackoff:
   240  		waitTime = shard.bootstrapWaitTime
   241  	default:
   242  		return Fail, fmt.Errorf("unsupported diagnose for repairBackoffError: %v", diagnose)
   243  	}
   244  	if now.Sub(shard.lastDiagnoseSince) < waitTime {
   245  		shard.logger.Infof("Detected %v at %v. In wait time for network partition", diagnose, shard.lastDiagnoseSince)
   246  		return Noop, nil
   247  	}
   248  	shard.logger.Infof("Detected %v at %v. Start repairing after %v", diagnose, shard.lastDiagnoseSince, waitTime)
   249  	err = shard.stopAndRebootstrap(ctx)
   250  	repairTimingsMs.Record([]string{string(diagnose), strconv.FormatBool(err == nil)}, now)
   251  	if err != nil {
   252  		return Fail, err
   253  	}
   254  	return Success, nil
   255  }
   256  
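        // stopAndRebootstrap stops group replication on every instance that is not already OFFLINE, then
        // rebootstraps the group on the instance with the largest GTID set, reusing the existing group name.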
   257  func (shard *GRShard) stopAndRebootstrap(ctx context.Context) error {
   258  	// Make sure we still hold the topo server lock before moving on
   259  	if err := shard.checkShardLocked(ctx); err != nil {
   260  		return err
   261  	}
   262  	// Before bootstrapping the group, we need to stop group replication first.
   263  	// Abort aggressively here as soon as we encounter an error.
   264  	// StopGroupLocked will check if instance is NOT in "ONLINE"/"RECOVERING" state (i.e., UNREACHABLE, ERROR or OFFLINE)
   265  	errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
   266  		defer wg.Done()
   267  		status := shard.sqlGroup.GetStatus(instance.instanceKey)
   268  		if status != nil && status.State == db.OFFLINE {
   269  			shard.logger.Infof("stop group replication on %v skipped because it is already OFFLINE", instance.alias)
   270  			return
   271  		}
   272  		shard.logger.Infof("stop group replication on %v", instance.alias)
   273  		err := shard.dbAgent.StopGroupLocked(instance.instanceKey)
   274  		if err != nil {
   275  			if !unreachableError(err) {
   276  				er.RecordError(err)
   277  			}
   278  			shard.logger.Warningf("Error during stop group replication on %v: %v", instance.instanceKey.Hostname, err)
   279  		}
   280  	})
   281  	// We don't check allowPartialUnhealthyNodes here because we don't record unreachable errors above;
   282  	// hence if errorRecorder has an error, it indicates the mysqld is still reachable but something
   283  	// else went wrong.
   284  	if errorRecorder.HasErrors() {
   285  		shard.logger.Errorf("Failed to stop group replication %v", errorRecorder.Error())
   286  		return errorRecorder.Error()
   287  	}
   288  	shard.logger.Infof("Stopped the group for %v", formatKeyspaceShard(shard.KeyspaceShard))
   289  	shard.logger.Info("Start finding a candidate to rebootstrap")
   290  	candidate, err := shard.findRebootstrapCandidate(ctx)
   291  	if err != nil {
   292  		shard.logger.Errorf("Failed to find rebootstrap candidate: %v", err)
   293  		return err
   294  	}
   295  	shard.refreshSQLGroup()
   296  	if !shard.sqlGroup.IsSafeToRebootstrap() {
   297  		return errors.New("unsafe to rebootstrap group")
   298  	}
   299  	if abortRebootstrap {
   300  		shard.logger.Warningf("Abort stopAndRebootstrap because the abort_rebootstrap flag is set")
   301  		return errForceAbortBootstrap
   302  	}
   303  	shard.logger.Infof("Rebootstrap %v on %v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname)
   304  	// Make sure we still hold the topo server lock before moving on
   305  	if err := shard.checkShardLocked(ctx); err != nil {
   306  		return err
   307  	}
   308  	uuid := shard.sqlGroup.GetGroupName()
   309  	if uuid == "" {
   310  		return errors.New("trying to rebootstrap without uuid")
   311  	}
   312  	return shard.dbAgent.RebootstrapGroupLocked(candidate.instanceKey, uuid)
   313  }
   314  
   315  // allowPartialUnhealthyNodes returns true if rebootstrapSize is set to non-zero
   316  // and the number of errors we got is at most (total_num_tablets - rebootstrapSize)
   317  func (shard *GRShard) allowPartialUnhealthyNodes(errorRecorder *concurrency.AllErrorRecorder) bool {
   318  	if shard.sqlGroup.rebootstrapSize != 0 && len(shard.instances)-shard.sqlGroup.rebootstrapSize >= len(errorRecorder.GetErrors()) {
   319  		shard.logger.Warningf("Allow unhealthy nodes during the reboot group_size=%v, rebootstrap_config=%v, error=%v", shard.sqlGroup.expectedBootstrapSize, shard.sqlGroup.rebootstrapSize, len(errorRecorder.GetErrors()))
   320  		return true
   321  	}
   322  	return false
   323  }
   324  
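        // getGTIDSetFromAll fetches the applier GTID set from every instance in the shard (optionally skipping the
        // current MySQL primary) and records them so that callers can pick a failover candidate.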
   325  func (shard *GRShard) getGTIDSetFromAll(skipPrimary bool) (*groupGTIDRecorder, *concurrency.AllErrorRecorder, error) {
   326  	if len(shard.instances) == 0 {
   327  		return nil, nil, fmt.Errorf("%v has 0 instance", formatKeyspaceShard(shard.KeyspaceShard))
   328  	}
   329  	// Before we do a failover, we first verify that there is one agreed group name.
   330  	// If not, VTGR is not smart enough to figure out how to fail over
   331  	// Note: the caller should make sure the mysqlGroup is refreshed after we grab a shard level lock
   332  	mysqlGroup := shard.shardAgreedGroupName()
   333  	if mysqlGroup == "" {
   334  		return nil, nil, fmt.Errorf("unable to find an agreed group name in %v", formatKeyspaceShard(shard.KeyspaceShard))
   335  	}
   336  	primary := shard.findShardPrimaryTablet()
   337  	var mysqlPrimaryHost string
   338  	var mysqlPrimaryPort int
   339  	// skipPrimary is true during a manual failover or when there is an unreachable primary tablet;
   340  	// in both cases, there should be a reconciled primary tablet
   341  	if skipPrimary && primary != nil {
   342  		status := shard.sqlGroup.GetStatus(primary.instanceKey)
   343  		mysqlPrimaryHost, mysqlPrimaryPort = status.HostName, status.Port
   344  		shard.logger.Infof("Found primary instance from MySQL on %v", mysqlPrimaryHost)
   345  	}
   346  	gtidRecorder := &groupGTIDRecorder{}
   347  	// Iterate through all the instances in the shard and find the one with the largest GTID set, on a best-effort basis
   348  	// We wrap it with forAllInstances so that the failover can continue if there is a host
   349  	// that is unreachable
   350  	errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
   351  		defer wg.Done()
   352  		if skipPrimary && instance.instanceKey.Hostname == mysqlPrimaryHost && instance.instanceKey.Port == mysqlPrimaryPort {
   353  			shard.logger.Infof("Skip %v to failover to a non-primary node", mysqlPrimaryHost)
   354  			return
   355  		}
   356  		gtids, err := shard.dbAgent.FetchApplierGTIDSet(instance.instanceKey)
   357  		if err != nil {
   358  			er.RecordError(err)
   359  			shard.logger.Errorf("%v got an error while fetching applier GTIDs: %v", instance.alias, err)
   360  			shard.shardStatusCollector.recordProblematics(instance)
   361  			if unreachableError(err) {
   362  				shard.shardStatusCollector.recordUnreachables(instance)
   363  			}
   364  			return
   365  		}
   366  		if gtids == nil {
   367  			shard.logger.Warningf("[failover candidate] skip %s with empty gtid", instance.alias)
   368  			return
   369  		}
   370  		gtidRecorder.recordGroupGTIDs(gtids, instance)
   371  	})
   372  	return gtidRecorder, errorRecorder, nil
   373  }
   374  
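        // findRebootstrapCandidate returns the instance with the largest GTID set to rebootstrap the group on.
        // It refuses to proceed if any instance failed to report its GTIDs, unless allowPartialUnhealthyNodes permits it.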
   375  func (shard *GRShard) findRebootstrapCandidate(ctx context.Context) (*grInstance, error) {
   376  	gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(false)
   377  	if err != nil {
   378  		shard.logger.Errorf("Failed to get gtid from all: %v", err)
   379  		return nil, err
   380  	}
   381  	err = errorRecorder.Error()
   382  	// We cannot tolerate any error from mysql during a rebootstrap.
   383  	if err != nil && !shard.allowPartialUnhealthyNodes(errorRecorder) {
   384  		shard.logger.Errorf("Failed to fetch all GTID with forAllInstances for rebootstrap: %v", err)
   385  		return nil, err
   386  	}
   387  	candidate, err := shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, nil)
   388  	if err != nil {
   389  		shard.logger.Errorf("Failed to find rebootstrap candidate by GTID after forAllInstances: %v", err)
   390  		return nil, err
   391  	}
   392  	if candidate == nil {
   393  		return nil, fmt.Errorf("failed to find rebootstrap candidate for %v", formatKeyspaceShard(shard.KeyspaceShard))
   394  	}
   395  	if !shard.instanceReachable(ctx, candidate) {
   396  		shard.logger.Errorf("rebootstrap candidate %v (%v) is not reachable via ping", candidate.alias, candidate.instanceKey.Hostname)
   397  		return nil, fmt.Errorf("%v is unreachable", candidate.alias)
   398  	}
   399  	shard.logger.Infof("%v is the rebootstrap candidate", candidate.alias)
   400  	return candidate, nil
   401  }
   402  
   403  // Caller of this function should make sure it gets the shard lock and it has the
   404  // latest view of a shard. Otherwise, we might skip the wrong node when we locate the candidate
   405  func (shard *GRShard) findFailoverCandidate(ctx context.Context) (*grInstance, error) {
   406  	gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(true)
   407  	if err != nil {
   408  		shard.logger.Errorf("Failed to get gtid from all: %v", err)
   409  		return nil, err
   410  	}
   411  	err = errorRecorder.Error()
   412  	// During the repair for an unreachable primary we still have a mysql group.
   413  	// Failover within the group is safe, and finding the largest GTID set is an optimization,
   414  	// therefore we don't check errors from errorRecorder; we just log them
   415  	if err != nil {
   416  		shard.logger.Warningf("Errors when fetching all GTIDs with forAllInstances for failover: %v", err)
   417  	}
   418  	shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
   419  		defer wg.Done()
   420  		if !shard.instanceReachable(ctx, instance) {
   421  			shard.logger.Errorf("%v is not reachable via ping", instance.alias)
   422  			shard.shardStatusCollector.recordProblematics(instance)
   423  			shard.shardStatusCollector.recordUnreachables(instance)
   424  		}
   425  	})
   426  	var candidate *grInstance
   427  	candidate, err = shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, func(c context.Context, instance *grInstance) bool {
   428  		return !shard.shardStatusCollector.isUnreachable(instance)
   429  	})
   430  	if err != nil {
   431  		shard.logger.Errorf("Failed to find failover candidate by GTID after forAllInstances: %v", err)
   432  		return nil, err
   433  	}
   434  	if candidate == nil {
   435  		return nil, fmt.Errorf("failed to find failover candidate for %v", formatKeyspaceShard(shard.KeyspaceShard))
   436  	}
   437  	shard.logger.Infof("%v is the failover candidate", candidate.alias)
   438  	return candidate, nil
   439  }
   440  
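        // repairWrongPrimaryTablet grabs the shard lock, re-diagnoses under the lock and, if the primary tablet still
        // does not match the MySQL primary, promotes the tablet colocated with the MySQL primary via fixPrimaryTabletLocked.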
   441  func (shard *GRShard) repairWrongPrimaryTablet(ctx context.Context) (RepairResultCode, error) {
   442  	ctx, err := shard.LockShard(ctx, "repairWrongPrimaryTablet")
   443  	if err != nil {
   444  		shard.logger.Warningf("repairWrongPrimaryTablet fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err)
   445  		return Noop, err
   446  	}
   447  	defer shard.UnlockShard()
   448  	// We grab the shard level lock and check the diagnosis again
   449  	// to avoid race conditions
   450  	shard.refreshTabletsInShardLocked(ctx)
   451  	status, err := shard.diagnoseLocked(ctx)
   452  	if err != nil {
   453  		shard.logger.Errorf("Failed to diagnose: %v", err)
   454  		return Fail, err
   455  	}
   456  	if status != DiagnoseTypeWrongPrimaryTablet {
   457  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeWrongPrimaryTablet: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   458  		return Noop, nil
   459  	}
   460  	start := time.Now()
   461  	err = shard.fixPrimaryTabletLocked(ctx)
   462  	repairTimingsMs.Record([]string{DiagnoseTypeWrongPrimaryTablet, strconv.FormatBool(err == nil)}, start)
   463  	if err != nil {
   464  		return Fail, err
   465  	}
   466  	return Success, nil
   467  }
   468  
   469  // fixPrimaryTabletLocked changes Vitess primary tablet based on mysql group
   470  func (shard *GRShard) fixPrimaryTabletLocked(ctx context.Context) error {
   471  	host, port, isActive := shard.sqlGroup.GetPrimary()
   472  	if !isActive {
   473  		return db.ErrGroupInactive
   474  	}
   475  	// The primary tablet is not running on the mysql primary; we need to change it accordingly
   476  	candidate := shard.findTabletByHostAndPort(host, port)
   477  	if candidate == nil {
   478  		return errMissingPrimaryTablet
   479  	}
   480  	// Make sure we still hold the topo server lock before moving on
   481  	if err := shard.checkShardLocked(ctx); err != nil {
   482  		return err
   483  	}
   484  	err := shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false)
   485  	if err != nil {
   486  		return fmt.Errorf("failed to change type to primary on %v: %v", candidate.alias, err)
   487  	}
   488  	shard.logger.Infof("Successfully made %v the primary tablet", candidate.alias)
   489  	return nil
   490  }
   491  
   492  // repairUnconnectedReplica usually handles the case where there is a DiagnoseTypeHealthy tablet that is
   493  // not connected to the mysql primary node
   494  func (shard *GRShard) repairUnconnectedReplica(ctx context.Context) (RepairResultCode, error) {
   495  	ctx, err := shard.LockShard(ctx, "repairUnconnectedReplica")
   496  	if err != nil {
   497  		shard.logger.Warningf("repairUnconnectedReplica fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
   498  		return Noop, err
   499  	}
   500  	defer shard.UnlockShard()
   501  	shard.refreshTabletsInShardLocked(ctx)
   502  	status, err := shard.diagnoseLocked(ctx)
   503  	if err != nil {
   504  		shard.logger.Errorf("Failed to diagnose: %v", err)
   505  		return Fail, err
   506  	}
   507  	if status != DiagnoseTypeUnconnectedReplica {
   508  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnconnectedReplica: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   509  		return Noop, nil
   510  	}
   511  	start := time.Now()
   512  	err = shard.repairUnconnectedReplicaAction(ctx)
   513  	repairTimingsMs.Record([]string{DiagnoseTypeUnconnectedReplica, strconv.FormatBool(err == nil)}, start)
   514  	if err != nil {
   515  		return Fail, err
   516  	}
   517  	return Success, nil
   518  }
   519  
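        // repairUnconnectedReplicaAction finds an instance that is not part of the group, stops any leftover group
        // replication on it and joins it to the group through the current primary.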
   520  func (shard *GRShard) repairUnconnectedReplicaAction(ctx context.Context) error {
   521  	primaryInstance := shard.findShardPrimaryTablet()
   522  	target, err := shard.disconnectedInstance()
   523  	if err != nil {
   524  		return err
   525  	}
   526  	if target == nil {
   527  		shard.logger.Infof("there is no instance without group for %v", formatKeyspaceShard(shard.KeyspaceShard))
   528  		return nil
   529  	}
   530  	shard.logger.Infof("Connecting replica %v to %v", target.instanceKey.Hostname, primaryInstance.instanceKey.Hostname)
   531  	status := shard.sqlGroup.GetStatus(target.instanceKey)
   532  	// Make sure we still hold the topo server lock before moving on
   533  	if err := shard.checkShardLocked(ctx); err != nil {
   534  		return err
   535  	}
   536  	if status != nil && status.State != db.OFFLINE {
   537  		shard.logger.Infof("stop group replication on %v (%v) before joining the group", target.alias, status.State)
   538  		err := shard.dbAgent.StopGroupLocked(target.instanceKey)
   539  		if err != nil {
   540  			shard.logger.Errorf("Failed to stop group replication on %v: %v", target.instanceKey.Hostname, err)
   541  			return err
   542  		}
   543  		// Make sure we still hold the topo server lock before moving on
   544  		if err := shard.checkShardLocked(ctx); err != nil {
   545  			return err
   546  		}
   547  	}
   548  	return shard.dbAgent.JoinGroupLocked(target.instanceKey, primaryInstance.instanceKey)
   549  }
   550  
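        // repairUnreachablePrimary grabs the shard lock, re-diagnoses under the lock and, if the primary is still
        // unreachable, fails MySQL (and, for an active shard, the primary tablet) over to the best candidate.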
   551  func (shard *GRShard) repairUnreachablePrimary(ctx context.Context) (RepairResultCode, error) {
   552  	ctx, err := shard.LockShard(ctx, "repairUnreachablePrimary")
   553  	if err != nil {
   554  		shard.logger.Warningf("repairUnreachablePrimary fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
   555  		return Noop, err
   556  	}
   557  	defer shard.UnlockShard()
   558  	shard.refreshTabletsInShardLocked(ctx)
   559  	status, err := shard.diagnoseLocked(ctx)
   560  	if err != nil {
   561  		shard.logger.Errorf("Failed to diagnose: %v", err)
   562  		return Fail, err
   563  	}
   564  	if status != DiagnoseTypeUnreachablePrimary {
   565  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnreachablePrimary: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   566  		return Noop, nil
   567  	}
   568  	// We are here because either:
   569  	// 1. we have a primary tablet, but it's not reachable
   570  	// 2. we cannot find a primary tablet but we do have a mysql group
   571  	// we need to fail over mysql manually
   572  	//
   573  	// other cases will be handled by a different diagnose type, e.g.,
   574  	// a reachable primary tablet that runs on a different node than the mysql primary -> DiagnoseTypeWrongPrimaryTablet
   575  	start := time.Now()
   576  	err = shard.failoverLocked(ctx)
   577  	repairTimingsMs.Record([]string{DiagnoseTypeUnreachablePrimary, strconv.FormatBool(err == nil)}, start)
   578  	if err != nil {
   579  		return Fail, err
   580  	}
   581  	return Success, nil
   582  }
   583  
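        // repairInsufficientGroupSize sets the MySQL primary to read only when the group no longer has enough
        // members, so that no writes are accepted while the group is below the expected size.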
   584  func (shard *GRShard) repairInsufficientGroupSize(ctx context.Context) (RepairResultCode, error) {
   585  	ctx, err := shard.LockShard(ctx, "repairInsufficientGroupSize")
   586  	if err != nil {
   587  		shard.logger.Warningf("repairInsufficientGroupSize fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
   588  		return Noop, err
   589  	}
   590  	defer shard.UnlockShard()
   591  	shard.refreshTabletsInShardLocked(ctx)
   592  	status, err := shard.diagnoseLocked(ctx)
   593  	if err != nil {
   594  		shard.logger.Errorf("Failed to diagnose: %v", err)
   595  		return Fail, err
   596  	}
   597  	if status != DiagnoseTypeInsufficientGroupSize {
   598  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeInsufficientGroupSize: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   599  		return Noop, nil
   600  	}
   601  	// We check that the primary tablet is consistent with the sql primary before diagnosing InsufficientGroupSize,
   602  	// therefore the primary we found here is correct and healthy
   603  	primary := shard.findShardPrimaryTablet()
   604  	// Make sure we still hold the topo server lock before moving on
   605  	if err := shard.checkShardLocked(ctx); err != nil {
   606  		return Fail, err
   607  	}
   608  	// mysql group will set super_read_only properly automatically
   609  	// https://mysqlhighavailability.com/protecting-your-data-fail-safe-enhancements-to-group-replication/
   610  	// since Vitess only knows one writable node (the primary tablet), if we want to make sure there are no writes
   611  	// while the group has insufficient members, we can just set the primary mysql node to read only
   612  	err = shard.dbAgent.SetReadOnly(primary.instanceKey, true)
   613  	if err != nil {
   614  		return Fail, err
   615  	}
   616  	return Success, nil
   617  }
   618  
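        // repairReadOnlyShard clears read only on the MySQL primary, undoing repairInsufficientGroupSize when the
        // shard is diagnosed as read only.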
   619  func (shard *GRShard) repairReadOnlyShard(ctx context.Context) (RepairResultCode, error) {
   620  	ctx, err := shard.LockShard(ctx, "repairReadOnlyShard")
   621  	if err != nil {
   622  		shard.logger.Warningf("repairReadOnlyShard fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
   623  		return Noop, err
   624  	}
   625  	defer shard.UnlockShard()
   626  	shard.refreshTabletsInShardLocked(ctx)
   627  	status, err := shard.diagnoseLocked(ctx)
   628  	if err != nil {
   629  		shard.logger.Errorf("Failed to diagnose: %v", err)
   630  		return Fail, err
   631  	}
   632  	if status != DiagnoseTypeReadOnlyShard {
   633  		shard.logger.Infof("Shard %v is no longer in DiagnoseTypeReadOnlyShard: %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   634  		return Noop, nil
   635  	}
   636  	primary := shard.findShardPrimaryTablet()
   637  	// Make sure we still hold the topo server lock before moving on
   638  	if err := shard.checkShardLocked(ctx); err != nil {
   639  		return Fail, err
   640  	}
   641  	// undo what we did in repairInsufficientGroupSize
   642  	err = shard.dbAgent.SetReadOnly(primary.instanceKey, false)
   643  	if err != nil {
   644  		return Fail, err
   645  	}
   646  	return Success, nil
   647  }
   648  
   649  // Failover takes a shard and finds the node with the largest GTID set to be the new mysql primary of the group
   650  func (shard *GRShard) Failover(ctx context.Context) error {
   651  	ctx, err := shard.LockShard(ctx, "Failover")
   652  	if err != nil {
   653  		shard.logger.Warningf("Failover fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err)
   654  		return err
   655  	}
   656  	defer shard.UnlockShard()
   657  	shard.refreshTabletsInShardLocked(ctx)
   658  	return shard.failoverLocked(ctx)
   659  }
   660  
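        // failoverLocked fails the MySQL group over to the candidate with the largest GTID set and, if the shard is
        // active, also changes the corresponding vttablet type to PRIMARY.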
   661  func (shard *GRShard) failoverLocked(ctx context.Context) error {
   662  	candidate, err := shard.findFailoverCandidate(ctx)
   663  	if err != nil {
   664  		shard.logger.Errorf("Failed to find failover candidate: %v", err)
   665  		return err
   666  	}
   667  	// Make sure we still hold the topo server lock before moving on
   668  	if err := shard.checkShardLocked(ctx); err != nil {
   669  		return err
   670  	}
   671  	err = shard.dbAgent.Failover(candidate.instanceKey)
   672  	if err != nil {
   673  		shard.logger.Errorf("Failed to failover mysql to %v", candidate.alias)
   674  		return err
   675  	}
   676  	shard.logger.Infof("Successfully failover MySQL to %v for %v", candidate.instanceKey.Hostname, formatKeyspaceShard(shard.KeyspaceShard))
   677  	if !shard.isActive.Get() {
   678  		shard.logger.Infof("Skip vttablet failover on an inactive shard %v", formatKeyspaceShard(shard.KeyspaceShard))
   679  		return nil
   680  	}
   681  	// Make sure we still hold the topo server lock before moving on
   682  	if err := shard.checkShardLocked(ctx); err != nil {
   683  		return err
   684  	}
   685  	err = shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false)
   686  	if err != nil {
   687  		shard.logger.Errorf("Failed to failover Vitess %v", candidate.alias)
   688  		return err
   689  	}
   690  	shard.logger.Infof("Successfully failover Vitess to %v for %v", candidate.alias, formatKeyspaceShard(shard.KeyspaceShard))
   691  	return nil
   692  }
   693  
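        // findFailoverCandidateFromRecorder picks the instance with the largest recorded GTID set, skipping instances
        // that fail the optional check, and errors out if the recorded GTID sets have diverged.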
   694  func (shard *GRShard) findFailoverCandidateFromRecorder(ctx context.Context, recorder *groupGTIDRecorder, check func(context.Context, *grInstance) bool) (*grInstance, error) {
   695  	if len(recorder.gtidWithInstances) == 0 {
   696  		return nil, fmt.Errorf("empty failover candidate list for %v", formatKeyspaceShard(shard.KeyspaceShard))
   697  	}
   698  	// Sort the gtidWithInstances slice so that we get a consistent candidate
   699  	// in case they have the same gtid set
   700  	recorder.sort()
   701  	for _, gtidInst := range recorder.gtidWithInstances {
   702  		shard.logger.Infof("[failover candidates] %s gtid %s", gtidInst.instance.alias, gtidInst.gtids.String())
   703  	}
   704  	var largestGTIDs mysql.GTIDSet
   705  	var candidate *grInstance
   706  	var divergentCandidates []string
   707  	// All the instances in the recorder have a reachable mysqld,
   708  	// hence any of them is a valid failover candidate
   709  	for _, elem := range recorder.gtidWithInstances {
   710  		gtids := elem.gtids
   711  		inst := elem.instance
   712  		if check != nil && !check(ctx, inst) {
   713  			shard.logger.Warningf("Skip %v as candidate with gtid %v because it failed the check", inst.alias, gtids.String())
   714  			continue
   715  		}
   716  		if largestGTIDs == nil {
   717  			largestGTIDs = gtids
   718  			candidate = inst
   719  			continue
   720  		}
   721  		// If largestGTIDs is a subset of the current gtids, it means the instance has a larger GTID set than the candidate,
   722  		// so we need to swap them out
   723  		isSubset, isSuperset := compareGTIDSet(largestGTIDs, gtids)
   724  		if isSubset {
   725  			largestGTIDs = gtids
   726  			candidate = inst
   727  			continue
   728  		}
   729  		// largestGTIDs is neither a subset nor a superset of gtids;
   730  		// we log and append to divergentCandidates so that we know there is a problem in the group
   731  		// after the iteration
   732  		if !isSuperset {
   733  			shard.logger.Errorf("FetchGroupView divergent GTID set from host=%v GTIDSet=%v", inst.instanceKey.Hostname, gtids)
   734  			divergentCandidates = append(divergentCandidates, inst.alias)
   735  		}
   736  	}
   737  	// unless the GTID sets diverged, divergentCandidates should be empty
   738  	if len(divergentCandidates) > 0 {
   739  		divergentCandidates = append(divergentCandidates, candidate.alias)
   740  		return nil, fmt.Errorf("found more than one failover candidate by GTID set for %v: %v", formatKeyspaceShard(shard.KeyspaceShard), divergentCandidates)
   741  	}
   742  	return candidate, nil
   743  }
   744  
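        // compareGTIDSet reports whether set1 is a subset of set2 and, if not, whether set1 is a superset of set2;
        // both false means the two GTID sets have diverged.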
   745  func compareGTIDSet(set1, set2 mysql.GTIDSet) (bool, bool) {
   746  	isSubset := set2.Contains(set1)
   747  	// If set1 is a subset of set2, set2 is a GTID superset and we just need to record it
   748  	if isSubset {
   749  		return true, false
   750  	}
   751  	// If set1 is not a subset of set2, we need to see if set1 is actually a superset of set2;
   752  	// this is to detect GTID set divergence
   753  	isSubset = set1.Contains(set2)
   754  	// We already know set1 is not a subset of set2; if set2 is also not a subset of set1, it means
   755  	// the GTID sets have diverged
   756  	return false, isSubset
   757  }
   758  
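        // checkShardLocked verifies that we still hold the topology server lock for the shard; if the lock was lost
        // it bumps the unexpectedLockLost counter and returns a wrapped error.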
   759  func (shard *GRShard) checkShardLocked(ctx context.Context) error {
   760  	if err := topo.CheckShardLocked(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard); err != nil {
   761  		labels := []string{shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard}
   762  		unexpectedLockLost.Add(labels, 1)
   763  		shard.logger.Errorf("lost topology lock; aborting")
   764  		return vterrors.Wrap(err, "lost topology lock; aborting")
   765  	}
   766  	return nil
   767  }