vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/diagnose.go

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controller
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"math/rand"
    24  	"os"
    25  	"sort"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	"github.com/spf13/pflag"
    31  
    32  	"vitess.io/vitess/go/mysql"
    33  	"vitess.io/vitess/go/vt/concurrency"
    34  	"vitess.io/vitess/go/vt/servenv"
    35  	"vitess.io/vitess/go/vt/topo"
    36  	"vitess.io/vitess/go/vt/vterrors"
    37  	"vitess.io/vitess/go/vt/vtgr/db"
    38  )
    39  
    40  var pingTabletTimeout = 2 * time.Second
    41  
    42  func init() {
    43  	servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) {
    44  		fs.DurationVar(&pingTabletTimeout, "ping_tablet_timeout", 2*time.Second, "time to wait when we ping a tablet")
    45  	})
    46  }
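        // As a usage sketch (assuming a standard vtgr invocation), the timeout can be
        // overridden on the command line, e.g. vtgr --ping_tablet_timeout=3s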
    47  
    48  // DiagnoseType is the type of a Diagnose result
    49  type DiagnoseType string
    50  
    51  type instanceGTIDSet struct {
    52  	gtids    mysql.GTIDSet
    53  	instance *grInstance
    54  }
    55  
    56  // groupGTIDRecorder is used to help us query all the instances in parallel and record the results
    57  // it helps us take care of the consistency / synchronization among goroutines
    58  type groupGTIDRecorder struct {
    59  	name              string
    60  	gtidWithInstances []*instanceGTIDSet
    61  	hasActive         bool
    62  	sync.Mutex
    63  }
    64  
    65  const (
    66  	// DiagnoseTypeError represents an error status during diagnosis
    67  	DiagnoseTypeError DiagnoseType = "error"
    68  	// DiagnoseTypeHealthy represents a healthy shard
    69  	DiagnoseTypeHealthy = "Healthy"
    70  	// DiagnoseTypeShardHasNoGroup represents a cluster that has not been initialized yet
    71  	DiagnoseTypeShardHasNoGroup = "ShardHasNoGroup"
    72  	// DiagnoseTypeShardHasInactiveGroup represents the status where we have a group name but no members in it
    73  	DiagnoseTypeShardHasInactiveGroup = "ShardHasInactiveGroup"
    74  	// DiagnoseTypeInsufficientGroupSize represents a cluster with insufficient group members
    75  	DiagnoseTypeInsufficientGroupSize = "InsufficientGroupSize"
    76  	// DiagnoseTypeReadOnlyShard represents a cluster that has a read only node
    77  	DiagnoseTypeReadOnlyShard = "ReadOnlyShard"
    78  	// DiagnoseTypeUnreachablePrimary indicates that the primary tablet is unreachable
    79  	DiagnoseTypeUnreachablePrimary = "UnreachablePrimary"
    80  	// DiagnoseTypeWrongPrimaryTablet indicates that the primary tablet is incorrect based on the mysql group
    81  	DiagnoseTypeWrongPrimaryTablet = "WrongPrimaryTablet"
    82  	// DiagnoseTypeUnconnectedReplica represents a cluster with a primary tablet, but with a node that is not connected to it
    83  	DiagnoseTypeUnconnectedReplica = "UnconnectedReplica"
    84  	// DiagnoseTypeBackoffError represents a transient error e.g., the primary is unreachable
    85  	DiagnoseTypeBackoffError = "BackoffError"
    86  	// DiagnoseTypeBootstrapBackoff represents an ongoing bootstrap
    87  	DiagnoseTypeBootstrapBackoff = "BootstrapBackoff"
    88  
    89  	// diagnoseTypeUnknown represents an unclear intermediate diagnose state
    90  	diagnoseTypeUnknown = "Unknown"
    91  )
    92  
    93  // ScanAndRepairShard scans a particular shard by first diagnosing the shard with info from grShard
    94  // and then repairing the problem if the shard is unhealthy
    95  func (shard *GRShard) ScanAndRepairShard(ctx context.Context) {
    96  	status, err := shard.Diagnose(ctx)
    97  	if err != nil {
    98  		shard.logger.Errorf("fail to scanAndRepairShard %v/%v because of Diagnose error: %v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, err)
    99  		return
   100  	}
   101  	// We were able to get a Diagnose result without error
   102  	//
   103  	// Note: all the recovery functions should first try to grab a shard level lock
   104  	// and check the trigger conditions before doing anything. This is to avoid
   105  	// other VTGR instances trying to do the same thing
   106  	shard.logger.Infof("%v status is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
   107  	if _, err := shard.Repair(ctx, status); err != nil {
   108  		shard.logger.Errorf("failed to repair %v: %v", status, err)
   109  	}
   110  }
   111  
   112  // Diagnose the shard in the following order:
   113  // TODO: use FSM to make sure the status transition is correct
   114  // 1. if the shard has a group that every node agrees on
   115  // 2. if the group has any active (online / recovering) member
   116  // 3. if the shard has initialized a Vitess primary
   117  // 4. if the primary tablet is reachable
   118  // 5. if the Vitess primary and the mysql primary are reconciled
   119  // 6. if we have enough group members
   120  // 7. if the primary node has read_only=OFF
   121  // 8. if there is a node that is not in the mysql group
   122  func (shard *GRShard) Diagnose(ctx context.Context) (DiagnoseType, error) {
   123  	shard.Lock()
   124  	defer shard.Unlock()
   125  	diagnoseResult, err := shard.diagnoseLocked(ctx)
   126  	shard.shardStatusCollector.recordDiagnoseResult(diagnoseResult)
   127  	shard.populateVTGRStatusLocked()
   128  	if diagnoseResult != DiagnoseTypeHealthy {
   129  		shard.logger.Warningf(`VTGR diagnose shard as unhealthy for %s/%s: result=%v, last_result=%v, instances=%v, primary=%v, primary_tablet=%v, problematics=%v, unreachables=%v,\n%v`,
   130  			shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard,
   131  			shard.shardStatusCollector.status.DiagnoseResult,
   132  			shard.lastDiagnoseResult,
   133  			shard.shardStatusCollector.status.Instances,
   134  			shard.shardStatusCollector.status.Primary,
   135  			shard.primaryTabletAlias(),
   136  			shard.shardStatusCollector.status.Problematics,
   137  			shard.shardStatusCollector.status.Unreachables,
   138  			shard.sqlGroup.ToString())
   139  	}
   140  	if diagnoseResult != shard.lastDiagnoseResult {
   141  		shard.lastDiagnoseResult = diagnoseResult
   142  		shard.lastDiagnoseSince = time.Now()
   143  	}
   144  	return diagnoseResult, err
   145  }
   146  
   147  func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error) {
   148  	// the fast path only diagnoses problems with the Vitess primary,
   149  	// which is not needed if the shard is inactive
   150  	if shard.localDbPort != 0 && shard.isActive.Get() {
   151  		localView := shard.getLocalView()
   152  		if localView != nil {
   153  			fastDiagnose := shard.fastPathDiagnose(ctx, localView)
   154  			if fastDiagnose != diagnoseTypeUnknown {
   155  				// If we can use the local sql group info to diagnose,
   156  				// we should record the view as well. This view is all we need:
   157  				// later VTGR will look up the group name, primary etc. from the
   158  				// SQLGroup for repairing instead of getting nil
   159  				shard.sqlGroup.overrideView([]*db.GroupView{localView})
   160  				shard.logger.Infof("Diagnose %v from fast path", fastDiagnose)
   161  				return fastDiagnose, nil
   162  			}
   163  		}
   164  	}
   165  	// fast path is disabled or cannot diagnose the shard
   166  	// fall back to the normal strategy where we fetch info from all the nodes
   167  	err := shard.refreshSQLGroup()
   168  	if err != nil {
   169  		if errors.Is(err, db.ErrGroupBackoffError) {
   170  			return DiagnoseTypeBackoffError, nil
   171  		}
   172  		if errors.Is(err, db.ErrGroupOngoingBootstrap) {
   173  			return DiagnoseTypeBootstrapBackoff, nil
   174  		}
   175  		return DiagnoseTypeError, vterrors.Wrap(err, "fail to refreshSQLGroup")
   176  	}
   177  	// First, we check if there is any group in the shard
   178  	// if no, we should bootstrap one
   179  	mysqlGroup := shard.shardAgreedGroupName()
   180  	if mysqlGroup == "" {
   181  		if len(shard.sqlGroup.views) != shard.sqlGroup.expectedBootstrapSize {
   182  			return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasNoGroup with %v nodes", len(shard.sqlGroup.views))
   183  		}
   184  		return DiagnoseTypeShardHasNoGroup, nil
   185  	}
   186  	// We handle the case where the shard has an agreed group name but all nodes are offline
   187  	// In this situation, instead of bootstrapping a group, we should re-build the
   188  	// old group for the shard
   189  	if shard.isAllOfflineOrError() {
   190  		shard.logger.Info("Found all members are OFFLINE or ERROR")
   191  		// On rebootstrap, we always want to make sure _all_ the nodes in topo are reachable
   192  		// unless we override the rebootstrap size
   193  		desiredRebootstrapSize := len(shard.instances)
   194  		if shard.sqlGroup.rebootstrapSize != 0 {
   195  			desiredRebootstrapSize = shard.sqlGroup.rebootstrapSize
   196  		}
   197  		if len(shard.sqlGroup.views) != desiredRebootstrapSize {
   198  			return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasInactiveGroup with %v nodes expecting %v", len(shard.sqlGroup.views), desiredRebootstrapSize)
   199  		}
   200  		return DiagnoseTypeShardHasInactiveGroup, nil
   201  	}
   202  
   203  	// We only check the Vitess primary if the shard is active.
   204  	// Otherwise VTGR will only make sure there is a mysql group in the shard.
   205  	if shard.isActive.Get() {
   206  		// Secondly, we check if there is a primary tablet.
   207  		// If there is a group but we cannot find a primary tablet
   208  		// we should set it based on mysql group
   209  		hasWrongPrimary, err := shard.hasWrongPrimaryTablet(ctx)
   210  		if err != nil {
   211  			// errMissingGroup means we cannot find a mysql group for the shard
   212  			// we are in DiagnoseTypeShardHasNoGroup state
   213  			if err == errMissingGroup {
   214  				shard.logger.Warning("Missing mysql group")
   215  				return DiagnoseTypeShardHasNoGroup, nil
   216  			}
   217  			// errMissingPrimaryTablet means we cannot find a tablet based on mysql primary
   218  			// which means the tablet disconnected from topo server and we cannot find it
   219  			if err == errMissingPrimaryTablet {
   220  				return DiagnoseTypeUnreachablePrimary, nil
   221  			}
   222  			return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose shardNeedsInitialized")
   223  		}
   224  		if hasWrongPrimary {
   225  			return DiagnoseTypeWrongPrimaryTablet, nil
   226  		}
   227  
   228  		// Thirdly, we check if primary tablet is reachable
   229  		isPrimaryReachable, err := shard.isPrimaryReachable(ctx)
   230  		if err != nil {
   231  			return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose isPrimaryReachable")
   232  		}
   233  		if !isPrimaryReachable {
   234  			return DiagnoseTypeUnreachablePrimary, nil
   235  		}
   236  	}
   237  
   238  	// At this point, the primary tablet should be consistent with the mysql primary
   239  	// so the view from the primary tablet should be accurate
   240  	onlineMembers, isReadOnly := shard.getOnlineGroupInfo()
   241  	// If we find that an inactive shard is writable
   242  	// we should consider the shard as InsufficientGroupSize so that it is set to read only
   243  	if !isReadOnly && !shard.isActive.Get() {
   244  		return DiagnoseTypeInsufficientGroupSize, nil
   245  	}
   246  	// Then we check if we satisfy the minimum replica requirement
   247  	if shard.minNumReplicas > 0 {
   248  		if onlineMembers >= shard.minNumReplicas && isReadOnly && shard.isActive.Get() {
   249  			return DiagnoseTypeReadOnlyShard, nil
   250  		}
   251  		// If we disable readonly protection and still find we have a read only shard,
   252  		// we should return DiagnoseTypeReadOnlyShard so that VTGR can turn off read only
   253  		if shard.disableReadOnlyProtection && isReadOnly && shard.isActive.Get() {
   254  			return DiagnoseTypeReadOnlyShard, nil
   255  		}
   256  		// We don't check isActive here since if it is inactive, VTGR should already return InsufficientGroupSize
   257  		if !shard.disableReadOnlyProtection && onlineMembers < shard.minNumReplicas && !isReadOnly {
   258  			return DiagnoseTypeInsufficientGroupSize, nil
   259  		}
   260  	}
   261  
   262  	// Lastly, we check if there is a replica that is not connected to the primary node
   263  	disconnectedInstance, err := shard.disconnectedInstance()
   264  	if err != nil {
   265  		return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose disconnectedInstance")
   266  	}
   267  	if disconnectedInstance != nil {
   268  		return DiagnoseTypeUnconnectedReplica, nil
   269  	}
   270  
   271  	// If we get here, shard is DiagnoseTypeHealthy
   272  	return DiagnoseTypeHealthy, nil
   273  }
   274  
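        // getLocalView fetches the group view from the mysqld running on the same host as this
        // vtgr process (matched by the local hostname and the configured local db port);
        // it returns nil if no such tablet is found in the shard.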
   275  func (shard *GRShard) getLocalView() *db.GroupView {
   276  	localHostname, _ := os.Hostname()
   277  	localInst := shard.findTabletByHostAndPort(localHostname, shard.localDbPort)
   278  	if localInst == nil {
   279  		return nil
   280  	}
   281  	// TODO: consider using -db_socket to read local info
   282  	view, err := shard.dbAgent.FetchGroupView(localInst.alias, localInst.instanceKey)
   283  	// We still have the fallback logic if this fails, therefore we don't raise an error
   284  	// but try to get the local view on a best-effort basis
   285  	if err != nil {
   286  		shard.logger.Errorf("failed to fetch local group view: %v", err)
   287  	}
   288  	return view
   289  }
   290  
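        // fastPathDiagnose diagnoses the shard using only the local group view and the primary
        // tablet recorded in topo; it returns diagnoseTypeUnknown whenever the local information
        // is not sufficient to reach a conclusion.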
   291  func (shard *GRShard) fastPathDiagnose(ctx context.Context, view *db.GroupView) DiagnoseType {
   292  	pHost, pPort, isOnline := view.GetPrimaryView()
   293  	primaryTablet := shard.findShardPrimaryTablet()
   294  	if !isOnline || pHost == "" || pPort == 0 || primaryTablet == nil {
   295  		return diagnoseTypeUnknown
   296  	}
   297  	// VTGR will only bootstrap a group when it observes the same number of views as group_size.
   298  	// It means that if we can find an ONLINE primary, we should be able to trust the view reported locally.
   299  	// Together with the primary tablet from the topo server, we can determine:
   300  	// - if we need to failover vitess
   301  	// - if we need to failover mysql
   302  	if primaryTablet.instanceKey.Hostname != pHost || primaryTablet.instanceKey.Port != pPort {
   303  		// we found a mismatch, but if the reported mysql primary is not in
   304  		// the topology we should consider it unreachable.
   305  		if shard.findTabletByHostAndPort(pHost, pPort) == nil {
   306  			return DiagnoseTypeUnreachablePrimary
   307  		}
   308  		return DiagnoseTypeWrongPrimaryTablet
   309  	}
   310  	if !shard.instanceReachable(ctx, primaryTablet) {
   311  		return DiagnoseTypeUnreachablePrimary
   312  	}
   313  	return diagnoseTypeUnknown
   314  }
   315  
   316  func (shard *GRShard) shardAgreedGroupName() string {
   317  	if len(shard.instances) == 0 {
   318  		return ""
   319  	}
   320  	return shard.sqlGroup.GetGroupName()
   321  }
   322  
   323  func (shard *GRShard) isAllOfflineOrError() bool {
   324  	return shard.sqlGroup.IsAllOfflineOrError()
   325  }
   326  
   327  func (shard *GRShard) getOnlineGroupInfo() (int, bool) {
   328  	return shard.sqlGroup.GetOnlineGroupInfo()
   329  }
   330  
   331  func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) {
   332  	// Find out the hostname and port of the primary in the mysql group
   333  	// we try to use the local instance and then fall back to a random instance to check mysqld
   334  	// in case the primary is unreachable
   335  	host, port, _ := shard.sqlGroup.GetPrimary()
   336  	if !isHostPortValid(host, port) {
   337  		shard.logger.Warningf("Invalid address for primary %v:%v", host, port)
   338  		return false, errMissingGroup
   339  	}
   340  	// Make sure we have a tablet available
   341  	// findTabletByHostAndPort returns nil when we cannot find a tablet
   342  	// that is running on host:port, which means the tablet got stuck
   343  	// or is not reachable.
   344  	// We return errMissingPrimaryTablet so that VTGR will trigger a failover
   345  	tablet := shard.findTabletByHostAndPort(host, port)
   346  	if tablet == nil || !shard.instanceReachable(ctx, tablet) {
   347  		shard.logger.Errorf("Failed to find tablet that is running with mysql on %v:%v", host, port)
   348  		return false, errMissingPrimaryTablet
   349  	}
   350  	// Now we know we have a valid mysql primary in the group
   351  	// we should make sure tablets are aligned with it
   352  	primary := shard.findShardPrimaryTablet()
   353  	// If we failed to find a primary for the shard, it mostly means we are initializing the shard
   354  	// return true directly so that VTGR will set primary tablet according to MySQL group
   355  	if primary == nil {
   356  		shard.logger.Infof("unable to find primary tablet for %v", formatKeyspaceShard(shard.KeyspaceShard))
   357  		return true, nil
   358  	}
   359  	return (host != primary.instanceKey.Hostname) || (port != primary.instanceKey.Port), nil
   360  }
   361  
   362  func (shard *GRShard) isPrimaryReachable(ctx context.Context) (bool, error) {
   363  	primaryTablet := shard.findShardPrimaryTablet()
   364  	if primaryTablet == nil {
   365  		return false, fmt.Errorf("unable to find primary for %v", formatKeyspaceShard(shard.KeyspaceShard))
   366  	}
   367  	return shard.instanceReachable(ctx, primaryTablet), nil
   368  }
   369  
   370  func (shard *GRShard) instanceReachable(ctx context.Context, instance *grInstance) bool {
   371  	pingCtx, cancel := context.WithTimeout(context.Background(), pingTabletTimeout)
   372  	defer cancel()
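        	// the channel is buffered so that the Ping goroutine below can always deliver
        	// its result and exit, even if we stop waiting after pingTabletTimeout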
   373  	c := make(chan error, 1)
   374  	// tmc.Ping creates the grpc client connection first without a timeout via dial,
   375  	// then calls the grpc endpoint using the context with a timeout.
   376  	// This is problematic if the host is really unreachable: we have to wait for
   377  	// all the retries inside grpc.dial with exponential backoff
   378  	go func() { c <- shard.tmc.Ping(pingCtx, instance.tablet) }()
   379  	select {
   380  	case <-pingCtx.Done():
   381  		shard.logger.Errorf("Ping abort timeout %v", pingTabletTimeout)
   382  		return false
   383  	case err := <-c:
   384  		if err != nil {
   385  			shard.logger.Errorf("Ping error host=%v: %v", instance.instanceKey.Hostname, err)
   386  		}
   387  		return err == nil
   388  	}
   389  }
   390  
   391  // findShardPrimaryTablet returns the primary for the shard
   392  // it is either based on shard info from global topo or based on tablet types
   393  // from local topo
   394  func (shard *GRShard) findShardPrimaryTablet() *grInstance {
   395  	var primaryInstance *grInstance
   396  	for _, instance := range shard.instances {
   397  		if shard.primaryAlias == instance.alias {
   398  			return instance
   399  		}
   400  	}
   401  	return primaryInstance
   402  }
   403  
   404  func (shard *GRShard) primaryTabletAlias() string {
   405  	primary := shard.findShardPrimaryTablet()
   406  	if primary == nil {
   407  		return "UNKNOWN"
   408  	}
   409  	return primary.alias
   410  }
   411  
   412  // disconnectedInstance iterates over all the known replica records
   413  // and checks mysql to see if group replication is set up on them
   414  func (shard *GRShard) disconnectedInstance() (*grInstance, error) {
   415  	primaryInstance := shard.findShardPrimaryTablet()
   416  	// if there is no primary, we should recover from DiagnoseTypeWrongPrimaryTablet
   417  	if primaryInstance == nil {
   418  		return nil, fmt.Errorf("%v does not have primary", formatKeyspaceShard(shard.KeyspaceShard))
   419  	}
   420  	// Up to this check, we know:
   421  	// - shard has an agreed group
   422  	// - shard has a primary tablet
   423  	// - shard primary tablet is running on the same node as mysql
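        	// Iterate the instances in random order so that, when more than one replica is
        	// disconnected, we do not always return (and repair) the same instance first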
   424  	rand.Shuffle(len(shard.instances), func(i, j int) {
   425  		shard.instances[i], shard.instances[j] = shard.instances[j], shard.instances[i]
   426  	})
   427  	for _, instance := range shard.instances {
   428  		// Skip instances without a hostname because they are not up and running
   429  		// also skip instances that raised unrecoverable errors
   430  		if shard.shardStatusCollector.isUnreachable(instance) {
   431  			shard.logger.Infof("Skip %v to check disconnectedInstance because it is unhealthy", instance.alias)
   432  			continue
   433  		}
   434  		isUnconnected := shard.sqlGroup.IsUnconnectedReplica(instance.instanceKey)
   435  		if isUnconnected {
   436  			return instance, nil
   437  		}
   438  	}
   439  	return nil, nil
   440  }
   441  
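        // recordGroupStatus records the group name reported by one instance and whether that
        // instance observed any active member; it errors if instances disagree on the group name.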
   442  func (recorder *groupGTIDRecorder) recordGroupStatus(name string, isActive bool) error {
   443  	recorder.Lock()
   444  	defer recorder.Unlock()
   445  	if recorder.name != "" && recorder.name != name {
   446  		return fmt.Errorf("group has more than one group name")
   447  	}
   448  	recorder.name = name
   449  	// hasActive records true if any node finds an active member
   450  	if isActive {
   451  		recorder.hasActive = true
   452  	}
   453  	return nil
   454  }
   455  
   456  func (recorder *groupGTIDRecorder) recordGroupGTIDs(gtids mysql.GTIDSet, instance *grInstance) {
   457  	recorder.Lock()
   458  	defer recorder.Unlock()
   459  	recorder.gtidWithInstances = append(recorder.gtidWithInstances, &instanceGTIDSet{gtids: gtids, instance: instance})
   460  }
   461  
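        // sort orders the recorded GTID sets by tablet alias so that callers iterate them in a deterministic order.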
   462  func (recorder *groupGTIDRecorder) sort() {
   463  	sort.SliceStable(recorder.gtidWithInstances, func(i, j int) bool {
   464  		return recorder.gtidWithInstances[i].instance.alias < recorder.gtidWithInstances[j].instance.alias
   465  	})
   466  }
   467  
   468  func (collector *shardStatusCollector) recordDiagnoseResult(result DiagnoseType) {
   469  	collector.Lock()
   470  	defer collector.Unlock()
   471  	collector.status.DiagnoseResult = result
   472  }
   473  
   474  func (collector *shardStatusCollector) recordUnreachables(instance *grInstance) {
   475  	collector.Lock()
   476  	defer collector.Unlock()
   477  	// dedup
   478  	// the list size is at most the same as the number of instances in a shard, so iterating to dedup is not terrible
   479  	for _, alias := range collector.status.Unreachables {
   480  		if alias == instance.alias {
   481  			return
   482  		}
   483  	}
   484  	collector.status.Unreachables = append(collector.status.Unreachables, instance.alias)
   485  }
   486  
   487  func (collector *shardStatusCollector) clear() {
   488  	collector.Lock()
   489  	defer collector.Unlock()
   490  	collector.status.Unreachables = nil
   491  	collector.status.Problematics = nil
   492  }
   493  
   494  func (collector *shardStatusCollector) recordProblematics(instance *grInstance) {
   495  	collector.Lock()
   496  	defer collector.Unlock()
   497  	// dedup
   498  	// the list size is at most the same as the number of instances in a shard, so iterating to dedup is not terrible
   499  	for _, alias := range collector.status.Problematics {
   500  		if alias == instance.alias {
   501  			return
   502  		}
   503  	}
   504  	collector.status.Problematics = append(collector.status.Problematics, instance.alias)
   505  }
   506  
   507  func formatKeyspaceShard(keyspaceShard *topo.KeyspaceShard) string {
   508  	return fmt.Sprintf("%v/%v", keyspaceShard.Keyspace, keyspaceShard.Shard)
   509  }
   510  
   511  func isHostPortValid(host string, port int) bool {
   512  	return host != "" && port != 0
   513  }
   514  
   515  // We use forAllInstances in two cases:
   516  // 1. FetchGroupView GTIDs to find a candidate for failover.
   517  // If a node is not healthy it should not be considered as a failover candidate
   518  //
   519  // 2. FetchGroupView group member status to see if we need to bootstrap a group,
   520  // either for the first time or to rebuild a group after all the nodes died.
   521  //
   522  // The caller is responsible for deciding whether to tolerate errors from the forAllInstances call
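        //
        // Note: each task runs in its own goroutine and is responsible for calling wg.Done() on the provided WaitGroup.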
   523  func (shard *GRShard) forAllInstances(task func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder)) *concurrency.AllErrorRecorder {
   524  	errorRecord := concurrency.AllErrorRecorder{}
   525  	shard.shardStatusCollector.clear()
   526  	var wg sync.WaitGroup
   527  	for _, instance := range shard.instances {
   528  		wg.Add(1)
   529  		go task(instance, &wg, &errorRecord)
   530  	}
   531  	wg.Wait()
   532  	if len(errorRecord.Errors) > 0 {
   533  		shard.logger.Errorf("get errors in forAllInstances call: %v", errorRecord.Error())
   534  	}
   535  	return &errorRecord
   536  }
   537  
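        // unreachableError reports whether the error indicates that the host itself is unreachable
        // (or has no usable mysql address), rather than a mysql-level failure on a reachable host.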
   538  func unreachableError(err error) bool {
   539  	contains := []string{
   540  		// "no such host"/"no route to host" is the error when a host is not reachalbe
   541  		"no such host",
   542  		"no route to host",
   543  		// "connect: connection refused" is the error when a mysqld refused the connection
   544  		"connect: connection refused",
   545  		// "invalid mysql instance key" is the error when a tablet does not populate mysql hostname or port
   546  		// this can happen if the tablet crashed. We keep them in the grShard.instances list to compute
   547  		// quorum but consider it as an unreachable host.
   548  		"invalid mysql instance key",
   549  	}
   550  	for _, k := range contains {
   551  		if strings.Contains(err.Error(), k) {
   552  			return true
   553  		}
   554  	}
   555  	return false
   556  }
   557  
   558  // refreshSQLGroup hits all instances and renders a SQL group locally for later diagnoses
   559  // the SQL group contains a list of "views" for the group from all the available nodes
   560  func (shard *GRShard) refreshSQLGroup() error {
   561  	// reset views in sql group
   562  	shard.sqlGroup.clear()
   563  	er := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
   564  		defer wg.Done()
   565  		view, err := shard.dbAgent.FetchGroupView(instance.alias, instance.instanceKey)
   566  		// We just log the error here because we rely on mysql to tell us if it is happy or not,
   567  		// even if the node is unreachable
   568  		if err != nil {
   569  			er.RecordError(err)
   570  			shard.shardStatusCollector.recordProblematics(instance)
   571  			if unreachableError(err) {
   572  				shard.shardStatusCollector.recordUnreachables(instance)
   573  			}
   574  			shard.logger.Errorf("%v get error while fetch group info: %v", instance.alias, err)
   575  			return
   576  		}
   577  		shard.sqlGroup.recordView(view)
   578  	})
   579  	// Only raise an error if we failed to get any data from mysql
   580  	// otherwise, we will use what we got from mysql directly
   581  	if len(er.Errors) == len(shard.instances) {
   582  		shard.logger.Errorf("fail to fetch any data for mysql")
   583  		return db.ErrGroupBackoffError
   584  	}
   585  	return shard.sqlGroup.Resolve()
   586  }