vitess.io/vitess@v0.16.2/go/vt/vtgr/controller/refresh.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controller
    18  
    19  import (
    20  	"fmt"
    21  	"strconv"
    22  	"sync"
    23  	"time"
    24  
    25  	"vitess.io/vitess/go/vt/topo/topoproto"
    26  
    27  	"golang.org/x/net/context"
    28  
    29  	"vitess.io/vitess/go/stats"
    30  	"vitess.io/vitess/go/sync2"
    31  	"vitess.io/vitess/go/vt/logutil"
    32  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    33  	"vitess.io/vitess/go/vt/topo"
    34  	"vitess.io/vitess/go/vt/vtgr/config"
    35  	"vitess.io/vitess/go/vt/vtgr/db"
    36  	"vitess.io/vitess/go/vt/vtgr/inst"
    37  	"vitess.io/vitess/go/vt/vtgr/log"
    38  )
    39  
    40  var (
    41  	lockShardTimingsMs = stats.NewMultiTimings("lockShard", "time vtgr takes to lock the shard", []string{"operation", "success"})
    42  )
    43  
    44  // grInstance represents an instance that's running MySQL GR
    45  // it wraps a InstanceKey plus some tablet related information
    46  type grInstance struct {
    47  	instanceKey      *inst.InstanceKey
    48  	tablet           *topodatapb.Tablet
    49  	primaryTimeStamp time.Time
    50  	alias            string
    51  }
    52  
    53  // GRTopo is VTGR wrapper for topo server
    54  type GRTopo interface {
    55  	GetShardNames(ctx context.Context, keyspace string) ([]string, error)
    56  	GetShard(ctx context.Context, keyspace, shard string) (*topo.ShardInfo, error)
    57  	GetTabletMapForShardByCell(ctx context.Context, keyspace, shard string, cells []string) (map[string]*topo.TabletInfo, error)
    58  	LockShard(ctx context.Context, keyspace, shard, action string) (context.Context, func(*error), error)
    59  }
    60  
    61  // GRTmcClient is VTGR wrapper for tmc client
    62  type GRTmcClient interface {
    63  	ChangeType(ctx context.Context, tablet *topodatapb.Tablet, dbType topodatapb.TabletType, semiSync bool) error
    64  	Ping(ctx context.Context, tablet *topodatapb.Tablet) error
    65  }
    66  
    67  // GRShard stores the information about a Vitess shard that's running MySQL GR
    68  type GRShard struct {
    69  	KeyspaceShard        *topo.KeyspaceShard
    70  	cells                []string
    71  	instances            []*grInstance
    72  	primaryAlias         string
    73  	shardStatusCollector *shardStatusCollector
    74  	sqlGroup             *SQLGroup
    75  	ts                   GRTopo
    76  	tmc                  GRTmcClient
    77  	dbAgent              db.Agent
    78  
    79  	// Every GRShard tracks a unlock function after it grab a topo lock for the shard
    80  	// VTGR needs to release the topo lock before gracefully shutdown
    81  	unlock func(*error)
    82  	// mutex to protect unlock function access
    83  	unlockMu sync.Mutex
    84  
    85  	// configuration
    86  	minNumReplicas            int
    87  	localDbPort               int
    88  	disableReadOnlyProtection bool
    89  
    90  	transientErrorWaitTime time.Duration
    91  	bootstrapWaitTime      time.Duration
    92  
    93  	lastDiagnoseResult DiagnoseType
    94  	lastDiagnoseSince  time.Time
    95  
    96  	isActive sync2.AtomicBool
    97  
    98  	logger *log.Logger
    99  
   100  	// lock prevents multiple go routine fights with each other
   101  	sync.Mutex
   102  }
   103  
   104  // shardStatusCollector is used for collecting shard status
   105  type shardStatusCollector struct {
   106  	status *ShardStatus
   107  	sync.Mutex
   108  }
   109  
   110  // ShardStatus is used for debugging purpose to get current status of a shard
   111  type ShardStatus struct {
   112  	Keyspace       string
   113  	Shard          string
   114  	Instances      []string
   115  	Unreachables   []string
   116  	Problematics   []string
   117  	Primary        string
   118  	DiagnoseResult DiagnoseType
   119  }
   120  
   121  func newShardStatusCollector(keyspace, shard string) *shardStatusCollector {
   122  	return &shardStatusCollector{
   123  		status: &ShardStatus{Keyspace: keyspace, Shard: shard},
   124  	}
   125  }
   126  
   127  // NewGRShard creates a new GRShard
   128  func NewGRShard(
   129  	keyspace, shard string,
   130  	cells []string,
   131  	tmc GRTmcClient,
   132  	ts GRTopo,
   133  	dbAgent db.Agent,
   134  	config *config.VTGRConfig,
   135  	localDbPort int,
   136  	isActive bool) *GRShard {
   137  	grShard := &GRShard{
   138  		KeyspaceShard:             &topo.KeyspaceShard{Keyspace: keyspace, Shard: shard},
   139  		cells:                     cells,
   140  		shardStatusCollector:      newShardStatusCollector(keyspace, shard),
   141  		tmc:                       tmc,
   142  		ts:                        ts,
   143  		dbAgent:                   dbAgent,
   144  		unlock:                    nil,
   145  		sqlGroup:                  NewSQLGroup(config.BootstrapGroupSize, true, keyspace, shard),
   146  		minNumReplicas:            config.MinNumReplica,
   147  		disableReadOnlyProtection: config.DisableReadOnlyProtection,
   148  		localDbPort:               localDbPort,
   149  		logger:                    log.NewVTGRLogger(keyspace, shard),
   150  		transientErrorWaitTime:    time.Duration(config.BackoffErrorWaitTimeSeconds) * time.Second,
   151  		bootstrapWaitTime:         time.Duration(config.BootstrapWaitTimeSeconds) * time.Second,
   152  	}
   153  	grShard.isActive.Set(isActive)
   154  	return grShard
   155  }
   156  
   157  // refreshTabletsInShardLocked is called by repair to get a fresh view of the shard
   158  // The caller is responsible to make sure the lock on GRShard
   159  func (shard *GRShard) refreshTabletsInShardLocked(ctx context.Context) {
   160  	instances, err := shard.refreshTabletsInShardInternal(ctx)
   161  	if err == nil {
   162  		shard.instances = instances
   163  	}
   164  	primary, err := shard.refreshPrimaryShard(ctx)
   165  	if err == nil {
   166  		shard.primaryAlias = primary
   167  		return
   168  	}
   169  	// If we failed to refreshPrimaryShard, use primary from local tablets
   170  	shard.primaryAlias = shard.findPrimaryFromLocalCell()
   171  }
   172  
   173  // UpdateTabletsInShardWithLock updates the shard instances with a lock
   174  func (shard *GRShard) UpdateTabletsInShardWithLock(ctx context.Context) {
   175  	instances, err := shard.refreshTabletsInShardInternal(ctx)
   176  	if err == nil {
   177  		// Take a per shard lock here when we actually refresh the data to avoid
   178  		// race conditions bewteen controller and repair tasks
   179  		shard.Lock()
   180  		shard.instances = instances
   181  		shard.Unlock()
   182  	}
   183  	primary, err := shard.refreshPrimaryShard(ctx)
   184  	// We set primary separately from instances so that if global topo is not available
   185  	// VTGR can still discover the new tablets from local cell
   186  	shard.Lock()
   187  	defer shard.Unlock()
   188  	if err == nil {
   189  		shard.primaryAlias = primary
   190  		return
   191  	}
   192  	shard.primaryAlias = shard.findPrimaryFromLocalCell()
   193  }
   194  
   195  func (shard *GRShard) refreshTabletsInShardInternal(ctx context.Context) ([]*grInstance, error) {
   196  	keyspace, shardName := shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard
   197  	tablets, err := shard.ts.GetTabletMapForShardByCell(ctx, keyspace, shardName, shard.cells)
   198  	if err != nil {
   199  		shard.logger.Errorf("Error fetching tablets for keyspace/shardName %v/%v: %v", keyspace, shardName, err)
   200  		return nil, err
   201  	}
   202  	return parseTabletInfos(tablets), nil
   203  }
   204  
   205  func (shard *GRShard) refreshPrimaryShard(ctx context.Context) (string, error) {
   206  	keyspace, shardName := shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard
   207  	si, err := shard.ts.GetShard(ctx, keyspace, shardName)
   208  	if err != nil {
   209  		shard.logger.Errorf("Error calling GetShard: %v", err)
   210  		return "", err
   211  	}
   212  	return topoproto.TabletAliasString(si.PrimaryAlias), nil
   213  }
   214  
   215  // findPrimaryFromLocalCell iterates through the replicas stored in grShard and returns
   216  // the one that's marked as primary
   217  func (shard *GRShard) findPrimaryFromLocalCell() string {
   218  	var latestPrimaryTimestamp time.Time
   219  	var primaryInstance *grInstance
   220  	for _, instance := range shard.instances {
   221  		if instance.tablet.Type == topodatapb.TabletType_PRIMARY {
   222  			// It is possible that there are more than one master in topo server
   223  			// we should compare timestamp to pick the latest one
   224  			if latestPrimaryTimestamp.Before(instance.primaryTimeStamp) {
   225  				latestPrimaryTimestamp = instance.primaryTimeStamp
   226  				primaryInstance = instance
   227  			}
   228  		}
   229  	}
   230  	if primaryInstance != nil {
   231  		return primaryInstance.alias
   232  	}
   233  	return ""
   234  }
   235  
   236  // parseTabletInfos replaces the replica reports for the shard key
   237  // Note: this is not thread-safe
   238  func parseTabletInfos(tablets map[string]*topo.TabletInfo) []*grInstance {
   239  	// collect all replicas
   240  	var newReplicas []*grInstance
   241  	for alias, tabletInfo := range tablets {
   242  		tablet := tabletInfo.Tablet
   243  		// Only monitor primary, replica and ronly tablet types
   244  		switch tablet.Type {
   245  		case topodatapb.TabletType_PRIMARY, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY:
   246  			// mysql hostname and port might be empty here if tablet is not running
   247  			// we will treat them as unreachable
   248  			instanceKey := inst.InstanceKey{
   249  				Hostname: tablet.MysqlHostname,
   250  				Port:     int(tablet.MysqlPort),
   251  			}
   252  			grInstance := grInstance{
   253  				instanceKey:      &instanceKey,
   254  				tablet:           tablet,
   255  				primaryTimeStamp: logutil.ProtoToTime(tablet.PrimaryTermStartTime),
   256  				alias:            alias,
   257  			}
   258  			newReplicas = append(newReplicas, &grInstance)
   259  		}
   260  	}
   261  	return newReplicas
   262  }
   263  
   264  // LockShard locks the keyspace-shard on topo server to prevent others from executing conflicting actions.
   265  func (shard *GRShard) LockShard(ctx context.Context, action string) (context.Context, error) {
   266  	if shard.KeyspaceShard.Keyspace == "" || shard.KeyspaceShard.Shard == "" {
   267  		return nil, fmt.Errorf("try to grab lock with incomplete information: %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard)
   268  	}
   269  	shard.unlockMu.Lock()
   270  	defer shard.unlockMu.Unlock()
   271  	if shard.unlock != nil {
   272  		return nil, fmt.Errorf("try to grab lock for %s/%s while the shard holds an unlock function", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard)
   273  	}
   274  	start := time.Now()
   275  	ctx, unlock, err := shard.ts.LockShard(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, fmt.Sprintf("VTGR repairing %s", action))
   276  	lockShardTimingsMs.Record([]string{action, strconv.FormatBool(err == nil)}, start)
   277  	if err != nil {
   278  		return nil, err
   279  	}
   280  	shard.unlock = unlock
   281  	return ctx, nil
   282  }
   283  
   284  // UnlockShard unlocks the keyspace-shard on topo server
   285  // and set the unlock function to nil in the container
   286  func (shard *GRShard) UnlockShard() {
   287  	shard.unlockMu.Lock()
   288  	defer shard.unlockMu.Unlock()
   289  	if shard.unlock == nil {
   290  		shard.logger.Warningf("Shard %s/%s does not hold a lock", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard)
   291  		return
   292  	}
   293  	var err error
   294  	shard.unlock(&err)
   295  	shard.unlock = nil
   296  }
   297  
   298  func (shard *GRShard) findTabletByHostAndPort(host string, port int) *grInstance {
   299  	for _, instance := range shard.instances {
   300  		if instance.instanceKey.Hostname == host && instance.instanceKey.Port == port {
   301  			return instance
   302  		}
   303  	}
   304  	return nil
   305  }
   306  
   307  func (shard *GRShard) getToleratedNumError() int {
   308  	quorum := len(shard.instances)/2 + 1
   309  	return len(shard.instances) - quorum
   310  }
   311  
   312  func (shard *GRShard) populateVTGRStatusLocked() {
   313  	var instanceList []string
   314  	for _, instance := range shard.instances {
   315  		instanceList = append(instanceList, instance.alias)
   316  	}
   317  	shard.shardStatusCollector.status.Instances = instanceList
   318  	if primary := shard.findShardPrimaryTablet(); primary != nil {
   319  		shard.shardStatusCollector.status.Primary = primary.alias
   320  	}
   321  }
   322  
   323  // GetCurrentShardStatuses returns the status collector has
   324  func (shard *GRShard) GetCurrentShardStatuses() ShardStatus {
   325  	shard.Lock()
   326  	collector := shard.shardStatusCollector
   327  	// dereference status so that we return a copy of the struct
   328  	status := *collector.status
   329  	shard.Unlock()
   330  	return status
   331  }
   332  
   333  // OverrideRebootstrapGroupSize force override the group expectedBootstrapSize used in safety check for rebootstrap
   334  func (shard *GRShard) OverrideRebootstrapGroupSize(groupSize int) error {
   335  	shard.Lock()
   336  	defer shard.Unlock()
   337  	shard.logger.Infof("Override rebootstrap group size=%v", groupSize)
   338  	shard.sqlGroup.rebootstrapSize = groupSize
   339  	return nil
   340  }
   341  
   342  // GetUnlock returns the unlock function for the shard for testing
   343  func (shard *GRShard) GetUnlock() func(*error) {
   344  	shard.unlockMu.Lock()
   345  	defer shard.unlockMu.Unlock()
   346  	return shard.unlock
   347  }
   348  
   349  // SetIsActive sets isActive for the shard
   350  func (shard *GRShard) SetIsActive(isActive bool) {
   351  	shard.logger.Infof("Setting is active to %v", isActive)
   352  	shard.isActive.Set(isActive)
   353  }
   354  
   355  func (collector *shardStatusCollector) isUnreachable(instance *grInstance) bool {
   356  	if instance.instanceKey == nil || instance.instanceKey.Hostname == "" {
   357  		return true
   358  	}
   359  	for _, alias := range collector.status.Unreachables {
   360  		if instance.alias == alias {
   361  			return true
   362  		}
   363  	}
   364  	return false
   365  }