vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/shard_sync.go

/*
Copyright 2019 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tabletmanager

import (
	"context"
	"time"

	"github.com/spf13/pflag"

	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/logutil"
	"vitess.io/vitess/go/vt/mysqlctl"
	"vitess.io/vitess/go/vt/servenv"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/topo/topoproto"
	"vitess.io/vitess/go/vt/vterrors"

	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

var shardSyncRetryDelay = 30 * time.Second

func registerShardSyncFlags(fs *pflag.FlagSet) {
	fs.DurationVar(&shardSyncRetryDelay, "shard_sync_retry_delay", shardSyncRetryDelay, "delay between retries of updates to keep the tablet and its shard record in sync")
}

func init() {
	servenv.OnParseFor("vtcombo", registerShardSyncFlags)
	servenv.OnParseFor("vttablet", registerShardSyncFlags)
}

// shardSyncLoop is a loop that tries to keep the tablet state and the
// shard record in sync.
//
// It is launched as a background goroutine in the tablet because it may need to
// initiate a tablet state change in response to an incoming watch event for the
// shard record, and it may need to continually retry updating the shard record
// if it's out of sync with the tablet state. At steady state, when the tablet
// and shard record are in sync, this goroutine goes to sleep waiting for
// something to change in either the tablet state or in the shard record.
//
// This goroutine gets woken up for shard record changes by maintaining a
// topo watch on the shard record. It gets woken up for tablet state changes by
// a notification signal from setTablet().
func (tm *TabletManager) shardSyncLoop(ctx context.Context, notifyChan <-chan struct{}, doneChan chan<- struct{}) {
	defer close(doneChan)

	// retryChan is how we wake up after going to sleep between retries.
	// If no retry is pending, this channel will be nil, which means it's fine
	// to always select on it -- a nil channel is never ready.
	var retryChan <-chan time.Time

	// shardWatch is how we get notified when the shard record is updated.
	// We only watch the shard record while we are primary.
	shardWatch := &shardWatcher{}
	defer shardWatch.stop()

	// This loop sleeps until it's notified that something may have changed.
	// Then it wakes up to check if anything needs to be synchronized.
	for {
		select {
		case <-notifyChan:
			// Something may have changed in the tablet state.
			log.Info("Change to tablet state")
		case <-retryChan:
			// It's time to retry a previous failed sync attempt.
			log.Info("Retry sync")
		case event := <-shardWatch.watchChan:
			// Something may have changed in the shard record.
			// We don't use the watch event except to know that we should
			// re-read the shard record, and to know if the watch dies.
87 log.Info("Change in shard record") 88 if event.Err != nil { 89 // The watch failed. Stop it so we start a new one if needed. 90 log.Errorf("Shard watch failed: %v", event.Err) 91 shardWatch.stop() 92 } 93 case <-ctx.Done(): 94 // Our context was cancelled. Terminate the loop. 95 return 96 } 97 98 // Disconnect any pending retry timer since we're already retrying for 99 // another reason. 100 retryChan = nil 101 102 // Get the latest internal tablet value, representing what we think we are. 103 tablet := tm.Tablet() 104 105 switch tablet.Type { 106 case topodatapb.TabletType_PRIMARY: 107 // This is a failsafe code because we've seen races that can cause 108 // primary term start time to become zero. 109 if tablet.PrimaryTermStartTime == nil { 110 log.Errorf("PrimaryTermStartTime should not be nil: %v", tablet) 111 // Start retry timer and go back to sleep. 112 retryChan = time.After(shardSyncRetryDelay) 113 continue 114 } 115 // If we think we're primary, check if we need to update the shard record. 116 // Fetch the start time from the record we just got, because the tm's tablet can change. 117 primaryAlias, shouldDemote, err := syncShardPrimary(ctx, tm.TopoServer, tablet, logutil.ProtoToTime(tablet.PrimaryTermStartTime)) 118 if err != nil { 119 log.Errorf("Failed to sync shard record: %v", err) 120 // Start retry timer and go back to sleep. 121 retryChan = time.After(shardSyncRetryDelay) 122 continue 123 } 124 if shouldDemote { 125 // Someone updated the PrimaryTermStartTime while we still think we're primary. 126 // This means that we should end our term, since someone else must have claimed primaryship 127 // and wrote to the shard record 128 if err := tm.endPrimaryTerm(ctx, primaryAlias); err != nil { 129 log.Errorf("Failed to end primary term: %v", err) 130 // Start retry timer and go back to sleep. 131 retryChan = time.After(shardSyncRetryDelay) 132 continue 133 } 134 // We're not primary anymore, so stop watching the shard record. 135 shardWatch.stop() 136 continue 137 } 138 139 // As long as we're primary, watch the shard record so we'll be 140 // notified if another primary takes over. 141 if shardWatch.active() { 142 // We already have an active watch. Nothing to do. 143 continue 144 } 145 if err := shardWatch.start(tm.TopoServer, tablet.Keyspace, tablet.Shard); err != nil { 146 log.Errorf("Failed to start shard watch: %v", err) 147 // Start retry timer and go back to sleep. 148 retryChan = time.After(shardSyncRetryDelay) 149 continue 150 } 151 default: 152 // If we're not primary, stop watching the shard record, 153 // so only primaries contribute to global topo watch load. 154 shardWatch.stop() 155 } 156 } 157 } 158 159 // syncShardPrimary is called when we think we're primary. 160 // It checks that the shard record agrees, and updates it if possible. 161 // 162 // If the returned error is nil, the returned primaryAlias indicates the current 163 // primary tablet according to the shard record. 164 // 165 // If the shard record indicates a new primary has taken over, this returns 166 // success (we successfully synchronized), but the returned primaryAlias will be 167 // different from the input tablet.Alias. 
func syncShardPrimary(ctx context.Context, ts *topo.Server, tablet *topodatapb.Tablet, PrimaryTermStartTime time.Time) (primaryAlias *topodatapb.TabletAlias, shouldDemote bool, err error) {
	ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
	defer cancel()

	var shardInfo *topo.ShardInfo
	shouldDemote = false
	_, err = ts.UpdateShardFields(ctx, tablet.Keyspace, tablet.Shard, func(si *topo.ShardInfo) error {
		lastTerm := si.GetPrimaryTermStartTime()

		// Save the ShardInfo so we can check it afterward.
		// We can't use the return value of UpdateShardFields because it might be nil.
		shardInfo = si

		if PrimaryTermStartTime.Before(lastTerm) {
			// In this case we have outdated information, so we will demote ourselves.
			shouldDemote = true
		}

		// Only attempt an update if our term is more recent.
		if !PrimaryTermStartTime.After(lastTerm) {
			return topo.NewError(topo.NoUpdateNeeded, si.ShardName())
		}

		aliasStr := topoproto.TabletAliasString(tablet.Alias)
		log.Infof("Updating shard record: primary_alias=%v, primary_term_start_time=%v", aliasStr, PrimaryTermStartTime)
		si.PrimaryAlias = tablet.Alias
		si.PrimaryTermStartTime = logutil.TimeToProto(PrimaryTermStartTime)
		return nil
	})
	if err != nil {
		return nil, shouldDemote, err
	}

	return shardInfo.PrimaryAlias, shouldDemote, nil
}

// endPrimaryTerm is called when we have unexpectedly lost primaryship.
//
// Under normal circumstances, we should be gracefully demoted before a new
// primary appears. This function is only reached when that graceful demotion
// failed or was skipped, so we only find out we're no longer primary after the
// new primary has started advertising itself.
//
// If active reparents are enabled, we demote our own MySQL to a replica and
// update our tablet type to REPLICA.
//
// If active reparents are disabled, we don't touch our MySQL.
// We just directly update our tablet type to REPLICA.
func (tm *TabletManager) endPrimaryTerm(ctx context.Context, primaryAlias *topodatapb.TabletAlias) error {
	primaryAliasStr := topoproto.TabletAliasString(primaryAlias)
	log.Warningf("Another tablet (%v) has won primary election. Stepping down to %v.", primaryAliasStr, tm.baseTabletType)

	if mysqlctl.DisableActiveReparents {
		// Don't touch anything at the MySQL level. Just update tablet state.
		log.Infof("Active reparents are disabled; updating tablet state only.")
		changeTypeCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
		defer cancel()
		if err := tm.tmState.ChangeTabletType(changeTypeCtx, tm.baseTabletType, DBActionNone); err != nil {
			return vterrors.Wrapf(err, "failed to change type to %v", tm.baseTabletType)
		}
		return nil
	}

	// Do a full demotion to convert MySQL into a replica.
	// We do not revert on partial failure here because this code path only
	// triggers after a new primary has taken over, so we are past the point of
	// no return. Instead, we should leave partial results and retry the rest
	// later.
236 log.Infof("Active reparents are enabled; converting MySQL to replica.") 237 demotePrimaryCtx, cancelDemotePrimary := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 238 defer cancelDemotePrimary() 239 if _, err := tm.demotePrimary(demotePrimaryCtx, false /* revertPartialFailure */); err != nil { 240 return vterrors.Wrap(err, "failed to demote primary") 241 } 242 setPrimaryCtx, cancelSetPrimary := context.WithTimeout(ctx, topo.RemoteOperationTimeout) 243 defer cancelSetPrimary() 244 log.Infof("Attempting to reparent self to new primary %v.", primaryAliasStr) 245 if primaryAlias == nil { 246 if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone); err != nil { 247 return err 248 } 249 } else { 250 if err := tm.setReplicationSourceSemiSyncNoAction(setPrimaryCtx, primaryAlias, 0, "", true); err != nil { 251 return vterrors.Wrap(err, "failed to reparent self to new primary") 252 } 253 } 254 return nil 255 } 256 257 func (tm *TabletManager) startShardSync() { 258 // Use a buffer size of 1 so we can remember we need to check the state 259 // even if the receiver is busy. We can drop any additional send attempts 260 // if the buffer is full because all we care about is that the receiver will 261 // be told it needs to recheck the state. 262 tm.mutex.Lock() 263 defer tm.mutex.Unlock() 264 tm._shardSyncChan = make(chan struct{}, 1) 265 tm._shardSyncDone = make(chan struct{}) 266 ctx, cancel := context.WithCancel(context.Background()) 267 tm._shardSyncCancel = cancel 268 269 // Start the sync loop in the background. 270 go tm.shardSyncLoop(ctx, tm._shardSyncChan, tm._shardSyncDone) 271 272 // Queue up a pending notification to force the loop to run once at startup. 273 go tm.notifyShardSync() 274 } 275 276 func (tm *TabletManager) stopShardSync() { 277 var doneChan <-chan struct{} 278 279 tm.mutex.Lock() 280 if tm._shardSyncCancel != nil { 281 tm._shardSyncCancel() 282 } 283 doneChan = tm._shardSyncDone 284 tm.mutex.Unlock() 285 286 // If the shard sync loop was running, wait for it to fully stop. 287 if doneChan != nil { 288 <-doneChan 289 } 290 } 291 292 func (tm *TabletManager) notifyShardSync() { 293 // If this is called before the shard sync is started, do nothing. 294 tm.mutex.Lock() 295 defer tm.mutex.Unlock() 296 297 if tm._shardSyncChan == nil { 298 return 299 } 300 301 // Try to send. If the channel buffer is full, it means a notification is 302 // already pending, so we don't need to do anything. 303 select { 304 case tm._shardSyncChan <- struct{}{}: 305 default: 306 } 307 }