vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/shard_sync.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package tabletmanager
    18  
    19  import (
    20  	"context"
    21  	"time"
    22  
    23  	"github.com/spf13/pflag"
    24  
    25  	"vitess.io/vitess/go/vt/log"
    26  	"vitess.io/vitess/go/vt/logutil"
    27  	"vitess.io/vitess/go/vt/mysqlctl"
    28  	"vitess.io/vitess/go/vt/servenv"
    29  	"vitess.io/vitess/go/vt/topo"
    30  	"vitess.io/vitess/go/vt/topo/topoproto"
    31  	"vitess.io/vitess/go/vt/vterrors"
    32  
    33  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    34  )
    35  
    36  var shardSyncRetryDelay = 30 * time.Second
    37  
    38  func registerShardSyncFlags(fs *pflag.FlagSet) {
    39  	fs.DurationVar(&shardSyncRetryDelay, "shard_sync_retry_delay", shardSyncRetryDelay, "delay between retries of updates to keep the tablet and its shard record in sync")
    40  }
    41  
    42  func init() {
    43  	servenv.OnParseFor("vtcombo", registerShardSyncFlags)
    44  	servenv.OnParseFor("vttablet", registerShardSyncFlags)
    45  }
    46  
    47  // shardSyncLoop is a loop that tries to keep the tablet state and the
    48  // shard record in sync.
    49  //
    50  // It is launched as a background goroutine in the tablet because it may need to
    51  // initiate a tablet state change in response to an incoming watch event for the
    52  // shard record, and it may need to continually retry updating the shard record
    53  // if it's out of sync with the tablet state. At steady state, when the tablet
    54  // and shard record are in sync, this goroutine goes to sleep waiting for
    55  // something to change in either the tablet state or in the shard record.
    56  //
    57  // This goroutine gets woken up for shard record changes by maintaining a
    58  // topo watch on the shard record. It gets woken up for tablet state changes by
    59  // a notification signal from setTablet().
    60  func (tm *TabletManager) shardSyncLoop(ctx context.Context, notifyChan <-chan struct{}, doneChan chan<- struct{}) {
    61  	defer close(doneChan)
    62  
    63  	// retryChan is how we wake up after going to sleep between retries.
    64  	// If no retry is pending, this channel will be nil, which means it's fine
    65  	// to always select on it -- a nil channel is never ready.
    66  	var retryChan <-chan time.Time
    67  
    68  	// shardWatch is how we get notified when the shard record is updated.
    69  	// We only watch the shard record while we are primary.
    70  	shardWatch := &shardWatcher{}
    71  	defer shardWatch.stop()
    72  
    73  	// This loop sleeps until it's notified that something may have changed.
    74  	// Then it wakes up to check if anything needs to be synchronized.
    75  	for {
    76  		select {
    77  		case <-notifyChan:
    78  			// Something may have changed in the tablet state.
    79  			log.Info("Change to tablet state")
    80  		case <-retryChan:
    81  			// It's time to retry a previous failed sync attempt.
    82  			log.Info("Retry sync")
    83  		case event := <-shardWatch.watchChan:
    84  			// Something may have changed in the shard record.
    85  			// We don't use the watch event except to know that we should
    86  			// re-read the shard record, and to know if the watch dies.
    87  			log.Info("Change in shard record")
    88  			if event.Err != nil {
    89  				// The watch failed. Stop it so we start a new one if needed.
    90  				log.Errorf("Shard watch failed: %v", event.Err)
    91  				shardWatch.stop()
    92  			}
    93  		case <-ctx.Done():
    94  			// Our context was cancelled. Terminate the loop.
    95  			return
    96  		}
    97  
    98  		// Disconnect any pending retry timer since we're already retrying for
    99  		// another reason.
   100  		retryChan = nil
   101  
   102  		// Get the latest internal tablet value, representing what we think we are.
   103  		tablet := tm.Tablet()
   104  
   105  		switch tablet.Type {
   106  		case topodatapb.TabletType_PRIMARY:
   107  			// This is a failsafe code because we've seen races that can cause
   108  			// primary term start time to become zero.
   109  			if tablet.PrimaryTermStartTime == nil {
   110  				log.Errorf("PrimaryTermStartTime should not be nil: %v", tablet)
   111  				// Start retry timer and go back to sleep.
   112  				retryChan = time.After(shardSyncRetryDelay)
   113  				continue
   114  			}
   115  			// If we think we're primary, check if we need to update the shard record.
   116  			// Fetch the start time from the record we just got, because the tm's tablet can change.
   117  			primaryAlias, shouldDemote, err := syncShardPrimary(ctx, tm.TopoServer, tablet, logutil.ProtoToTime(tablet.PrimaryTermStartTime))
   118  			if err != nil {
   119  				log.Errorf("Failed to sync shard record: %v", err)
   120  				// Start retry timer and go back to sleep.
   121  				retryChan = time.After(shardSyncRetryDelay)
   122  				continue
   123  			}
   124  			if shouldDemote {
   125  				// Someone updated the PrimaryTermStartTime while we still think we're primary.
   126  				// This means that we should end our term, since someone else must have claimed primaryship
   127  				// and wrote to the shard record
   128  				if err := tm.endPrimaryTerm(ctx, primaryAlias); err != nil {
   129  					log.Errorf("Failed to end primary term: %v", err)
   130  					// Start retry timer and go back to sleep.
   131  					retryChan = time.After(shardSyncRetryDelay)
   132  					continue
   133  				}
   134  				// We're not primary anymore, so stop watching the shard record.
   135  				shardWatch.stop()
   136  				continue
   137  			}
   138  
   139  			// As long as we're primary, watch the shard record so we'll be
   140  			// notified if another primary takes over.
   141  			if shardWatch.active() {
   142  				// We already have an active watch. Nothing to do.
   143  				continue
   144  			}
   145  			if err := shardWatch.start(tm.TopoServer, tablet.Keyspace, tablet.Shard); err != nil {
   146  				log.Errorf("Failed to start shard watch: %v", err)
   147  				// Start retry timer and go back to sleep.
   148  				retryChan = time.After(shardSyncRetryDelay)
   149  				continue
   150  			}
   151  		default:
   152  			// If we're not primary, stop watching the shard record,
   153  			// so only primaries contribute to global topo watch load.
   154  			shardWatch.stop()
   155  		}
   156  	}
   157  }
   158  
   159  // syncShardPrimary is called when we think we're primary.
   160  // It checks that the shard record agrees, and updates it if possible.
   161  //
   162  // If the returned error is nil, the returned primaryAlias indicates the current
   163  // primary tablet according to the shard record.
   164  //
   165  // If the shard record indicates a new primary has taken over, this returns
   166  // success (we successfully synchronized), but the returned primaryAlias will be
   167  // different from the input tablet.Alias.
   168  func syncShardPrimary(ctx context.Context, ts *topo.Server, tablet *topodatapb.Tablet, PrimaryTermStartTime time.Time) (primaryAlias *topodatapb.TabletAlias, shouldDemote bool, err error) {
   169  	ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   170  	defer cancel()
   171  
   172  	var shardInfo *topo.ShardInfo
   173  	shouldDemote = false
   174  	_, err = ts.UpdateShardFields(ctx, tablet.Keyspace, tablet.Shard, func(si *topo.ShardInfo) error {
   175  		lastTerm := si.GetPrimaryTermStartTime()
   176  
   177  		// Save the ShardInfo so we can check it afterward.
   178  		// We can't use the return value of UpdateShardFields because it might be nil.
   179  		shardInfo = si
   180  
   181  		if PrimaryTermStartTime.Before(lastTerm) {
   182  			// In this case, we have outdated information, we will demote ourself
   183  			shouldDemote = true
   184  		}
   185  
   186  		// Only attempt an update if our term is more recent.
   187  		if !PrimaryTermStartTime.After(lastTerm) {
   188  			return topo.NewError(topo.NoUpdateNeeded, si.ShardName())
   189  		}
   190  
   191  		aliasStr := topoproto.TabletAliasString(tablet.Alias)
   192  		log.Infof("Updating shard record: primary_alias=%v, primary_term_start_time=%v", aliasStr, PrimaryTermStartTime)
   193  		si.PrimaryAlias = tablet.Alias
   194  		si.PrimaryTermStartTime = logutil.TimeToProto(PrimaryTermStartTime)
   195  		return nil
   196  	})
   197  	if err != nil {
   198  		return nil, shouldDemote, err
   199  	}
   200  
   201  	return shardInfo.PrimaryAlias, shouldDemote, nil
   202  }
   203  
   204  // endPrimaryTerm is called when we unexpectedly lost primaryship.
   205  //
   206  // Under normal circumstances, we should be gracefully demoted before a new
   207  // primary appears. This function is only reached when that graceful demotion
   208  // failed or was skipped, so we only found out we're no longer primary after the
   209  // new primary started advertising itself.
   210  //
   211  // If active reparents are enabled, we demote our own MySQL to a replica and
   212  // update our tablet type to REPLICA.
   213  //
   214  // If active reparents are disabled, we don't touch our MySQL.
   215  // We just directly update our tablet type to REPLICA.
   216  func (tm *TabletManager) endPrimaryTerm(ctx context.Context, primaryAlias *topodatapb.TabletAlias) error {
   217  	primaryAliasStr := topoproto.TabletAliasString(primaryAlias)
   218  	log.Warningf("Another tablet (%v) has won primary election. Stepping down to %v.", primaryAliasStr, tm.baseTabletType)
   219  
   220  	if mysqlctl.DisableActiveReparents {
   221  		// Don't touch anything at the MySQL level. Just update tablet state.
   222  		log.Infof("Active reparents are disabled; updating tablet state only.")
   223  		changeTypeCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   224  		defer cancel()
   225  		if err := tm.tmState.ChangeTabletType(changeTypeCtx, tm.baseTabletType, DBActionNone); err != nil {
   226  			return vterrors.Wrapf(err, "failed to change type to %v", tm.baseTabletType)
   227  		}
   228  		return nil
   229  	}
   230  
   231  	// Do a full demotion to convert MySQL into a replica.
   232  	// We do not revert on partial failure here because this code path only
   233  	// triggers after a new primary has taken over, so we are past the point of
   234  	// no return. Instead, we should leave partial results and retry the rest
   235  	// later.
   236  	log.Infof("Active reparents are enabled; converting MySQL to replica.")
   237  	demotePrimaryCtx, cancelDemotePrimary := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   238  	defer cancelDemotePrimary()
   239  	if _, err := tm.demotePrimary(demotePrimaryCtx, false /* revertPartialFailure */); err != nil {
   240  		return vterrors.Wrap(err, "failed to demote primary")
   241  	}
   242  	setPrimaryCtx, cancelSetPrimary := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
   243  	defer cancelSetPrimary()
   244  	log.Infof("Attempting to reparent self to new primary %v.", primaryAliasStr)
   245  	if primaryAlias == nil {
   246  		if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone); err != nil {
   247  			return err
   248  		}
   249  	} else {
   250  		if err := tm.setReplicationSourceSemiSyncNoAction(setPrimaryCtx, primaryAlias, 0, "", true); err != nil {
   251  			return vterrors.Wrap(err, "failed to reparent self to new primary")
   252  		}
   253  	}
   254  	return nil
   255  }
   256  
   257  func (tm *TabletManager) startShardSync() {
   258  	// Use a buffer size of 1 so we can remember we need to check the state
   259  	// even if the receiver is busy. We can drop any additional send attempts
   260  	// if the buffer is full because all we care about is that the receiver will
   261  	// be told it needs to recheck the state.
   262  	tm.mutex.Lock()
   263  	defer tm.mutex.Unlock()
   264  	tm._shardSyncChan = make(chan struct{}, 1)
   265  	tm._shardSyncDone = make(chan struct{})
   266  	ctx, cancel := context.WithCancel(context.Background())
   267  	tm._shardSyncCancel = cancel
   268  
   269  	// Start the sync loop in the background.
   270  	go tm.shardSyncLoop(ctx, tm._shardSyncChan, tm._shardSyncDone)
   271  
   272  	// Queue up a pending notification to force the loop to run once at startup.
   273  	go tm.notifyShardSync()
   274  }
   275  
   276  func (tm *TabletManager) stopShardSync() {
   277  	var doneChan <-chan struct{}
   278  
   279  	tm.mutex.Lock()
   280  	if tm._shardSyncCancel != nil {
   281  		tm._shardSyncCancel()
   282  	}
   283  	doneChan = tm._shardSyncDone
   284  	tm.mutex.Unlock()
   285  
   286  	// If the shard sync loop was running, wait for it to fully stop.
   287  	if doneChan != nil {
   288  		<-doneChan
   289  	}
   290  }
   291  
   292  func (tm *TabletManager) notifyShardSync() {
   293  	// If this is called before the shard sync is started, do nothing.
   294  	tm.mutex.Lock()
   295  	defer tm.mutex.Unlock()
   296  
   297  	if tm._shardSyncChan == nil {
   298  		return
   299  	}
   300  
   301  	// Try to send. If the channel buffer is full, it means a notification is
   302  	// already pending, so we don't need to do anything.
   303  	select {
   304  	case tm._shardSyncChan <- struct{}{}:
   305  	default:
   306  	}
   307  }