vitess.io/vitess@v0.16.2/go/vt/vtgate/buffer/shard_buffer.go

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package buffer
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"runtime/debug"
    23  	"sync"
    24  	"time"
    25  
    26  	"vitess.io/vitess/go/vt/vtgate/errorsanitizer"
    27  
    28  	"vitess.io/vitess/go/vt/log"
    29  	"vitess.io/vitess/go/vt/logutil"
    30  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    31  	"vitess.io/vitess/go/vt/topo/topoproto"
    32  	"vitess.io/vitess/go/vt/vterrors"
    33  )
    34  
    35  // bufferState represents the different states a shardBuffer object can be in.
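         //
         // The transitions implemented in this file are: IDLE -> BUFFERING
         // (startBufferingLocked), BUFFERING -> DRAINING (stopBufferingLocked) and
         // DRAINING -> IDLE (drain).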
    36  type bufferState string
    37  
    38  const (
    39  	// stateIdle means no failover is currently in progress.
    40  	stateIdle bufferState = "IDLE"
    41  	// stateBuffering is the phase when a failover is in progress.
    42  	stateBuffering bufferState = "BUFFERING"
     43  	// stateDraining is the phase after a failover has ended, while the queue is being drained.
    44  	stateDraining bufferState = "DRAINING"
    45  )
    46  
    47  // entry is created per buffered request.
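         //
         // Lifecycle: bufferRequestLocked() creates the entry and appends it to the
         // queue, unblockAndWait() closes "done" once the request is drained, evicted
         // or removed, and the retried request finally cancels "bufferCtx" (via the
         // returned RetryDoneFunc or, for external cancellations, via remove()).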
    48  type entry struct {
    49  	// done will be closed by shardBuffer when the failover is over and the
    50  	// request can be retried.
    51  	// Any Go routine closing this channel must also remove the entry from the
     52  	// shardBuffer queue such that nobody else tries to close it.
    53  	done chan struct{}
    54  
    55  	// deadline is the time when the entry is out of the buffering window and it
    56  	// must be canceled.
    57  	deadline time.Time
    58  
    59  	// err is set if the buffering failed e.g. when the entry was evicted.
    60  	err error
    61  
    62  	// bufferCtx wraps the request ctx and is used to track the retry of a
    63  	// request during the drain phase. Once the retry is done, the caller
    64  	// must cancel this context (by calling bufferCancel).
    65  	bufferCtx    context.Context
    66  	bufferCancel func()
    67  }
    68  
    69  // shardBuffer buffers requests during a failover for a particular shard.
    70  // The object will be reused across failovers. If no failover is currently in
    71  // progress, the state is "IDLE".
    72  //
    73  // Note that this object is accessed concurrently by multiple threads:
    74  // - vtgate request threads
    75  // - discovery.HealthCheck listener execution thread
     76  // - timeout thread (timeout_thread.go) to evict requests buffered for too long
    77  // - drain() thread
    78  type shardBuffer struct {
    79  	// Immutable fields set at construction.
    80  	buf      *Buffer
    81  	mode     bufferMode
    82  	keyspace string
    83  	shard    string
    84  
    85  	// statsKey is used to update the stats variables.
    86  	statsKey []string
    87  	// statsKeyJoined is all elements of "statsKey" in one string, joined by ".".
    88  	statsKeyJoined string
    89  	logTooRecent   *logutil.ThrottledLogger
    90  
    91  	// mu guards the fields below.
    92  	mu    sync.RWMutex
    93  	state bufferState
    94  	// queue is the list of buffered requests (ordered by arrival).
    95  	queue []*entry
    96  	// externallyReparented is the maximum value of all seen
    97  	// "StreamHealthResponse.TabletexternallyReparentedTimestamp" values across
    98  	// all PRIMARY tablets of this shard.
    99  	// In practice, it is a) the last time the shard was reparented or b) the last
   100  	// time the TabletExternallyReparented RPC was called on the tablet to confirm
   101  	// that the tablet is the current PRIMARY.
   102  	// We assume the value is a Unix timestamp in seconds.
   103  	externallyReparented int64
   104  	// lastStart is the last time we saw the start of a failover.
   105  	lastStart time.Time
   106  	// lastEnd is the last time we saw the end of a failover.
   107  	lastEnd time.Time
   108  	// lastReparent is the last time we saw that the tablet alias of the PRIMARY
   109  	// changed i.e. we definitely reparented to a different tablet.
   110  	lastReparent time.Time
   111  	// currentPrimary is tracked to determine when to update "lastReparent".
   112  	currentPrimary *topodatapb.TabletAlias
   113  	// timeoutThread will be set while a failover is in progress and the object is
   114  	// in the BUFFERING state.
   115  	timeoutThread *timeoutThread
   116  	// wg tracks all pending Go routines. waitForShutdown() will use this field to
   117  	// block on them.
   118  	wg sync.WaitGroup
   119  }
   120  
   121  func newShardBufferHealthCheck(buf *Buffer, mode bufferMode, keyspace, shard string) *shardBuffer {
   122  	statsKey := []string{keyspace, shard}
   123  	initVariablesForShard(statsKey)
   124  
   125  	return &shardBuffer{
   126  		buf:            buf,
   127  		mode:           mode,
   128  		keyspace:       keyspace,
   129  		shard:          shard,
   130  		statsKey:       statsKey,
   131  		statsKeyJoined: fmt.Sprintf("%s.%s", keyspace, shard),
   132  		logTooRecent:   logutil.NewThrottledLogger(fmt.Sprintf("FailoverTooRecent-%v", topoproto.KeyspaceShardString(keyspace, shard)), 5*time.Second),
   133  		state:          stateIdle,
   134  	}
   135  }
   136  
   137  func (sb *shardBuffer) timeNow() time.Time {
   138  	return sb.buf.config.now()
   139  }
   140  
   141  // disabled returns true if neither buffering nor the dry-run mode is enabled.
   142  func (sb *shardBuffer) disabled() bool {
   143  	return sb.mode == bufferModeDisabled
   144  }
   145  
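         // waitForFailoverEnd blocks the request in the buffer while a failover is
         // detected or already in progress. It returns (nil, nil) if no buffering is
         // required (or in dry-run mode).
         //
         // A hypothetical caller sketch (illustrative only; executeOnPrimary and query
         // are placeholders, not Vitess identifiers):
         //
         //	retryDone, err := sb.waitForFailoverEnd(ctx, keyspace, shard, lastErr)
         //	if retryDone != nil {
         //		// Call retryDone once the retried request has finished; it cancels the
         //		// entry's bufferCtx so that its buffer slot can be released.
         //		defer retryDone()
         //	}
         //	if err != nil {
         //		return nil, err // e.g. bufferFullError or a canceled context
         //	}
         //	return executeOnPrimary(ctx, query) // placeholder for the actual retry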
   146  func (sb *shardBuffer) waitForFailoverEnd(ctx context.Context, keyspace, shard string, err error) (RetryDoneFunc, error) {
   147  	// We assume if err != nil then it's always caused by a failover.
   148  	// Other errors must be filtered at higher layers.
   149  	failoverDetected := err != nil
   150  
   151  	// Fast path (read lock): Check if we should NOT buffer a request.
   152  	sb.mu.RLock()
   153  	if !sb.shouldBufferLocked(failoverDetected) {
   154  		// No buffering required. Return early.
   155  		sb.mu.RUnlock()
   156  		return nil, nil
   157  	}
   158  	sb.mu.RUnlock()
   159  
   160  	// Buffering required. Acquire write lock.
   161  	sb.mu.Lock()
   162  	// Re-check state because it could have changed in the meantime.
   163  	if !sb.shouldBufferLocked(failoverDetected) {
   164  		// Buffering no longer required. Return early.
   165  		sb.mu.Unlock()
   166  		return nil, nil
   167  	}
   168  
    169  	// Start buffering if we have not started buffering yet.
   170  	if sb.state == stateIdle {
   171  		// Do not buffer if last failover is too recent. This is the case if:
   172  		// a) buffering was stopped recently
   173  		// OR
   174  		// b) we did not buffer, but observed a reparent very recently
   175  		now := sb.timeNow()
   176  		minTimeBetweenFailovers := sb.buf.config.MinTimeBetweenFailovers
   177  
   178  		// a) Buffering was stopped recently.
   179  		// This can happen when we stop buffering while MySQL is not ready yet
   180  		// (read-only mode is not cleared yet on the new primary).
   181  		lastBufferingStopped := now.Sub(sb.lastEnd)
   182  		if !sb.lastEnd.IsZero() && lastBufferingStopped < minTimeBetweenFailovers {
   183  			sb.mu.Unlock()
   184  			msg := "NOT starting buffering"
   185  			if sb.mode == bufferModeDryRun {
   186  				msg = "Dry-run: Would NOT have started buffering"
   187  			}
   188  
   189  			sb.logTooRecent.Infof("%v for shard: %s because the last failover which triggered buffering is too recent (%v < %v)."+
   190  				" (A failover was detected by this seen error: %v.)",
   191  				msg, topoproto.KeyspaceShardString(keyspace, shard), lastBufferingStopped, minTimeBetweenFailovers, err)
   192  
   193  			statsKeyWithReason := append(sb.statsKey, string(skippedLastFailoverTooRecent))
   194  			requestsSkipped.Add(statsKeyWithReason, 1)
   195  			return nil, nil
   196  		}
   197  
    198  		// b) The PRIMARY was reparented recently (but we did not buffer it).
   199  		// This can happen when we see the end of the reparent *before* the first
   200  		// request failure caused by the reparent. This is possible if the QPS is
   201  		// very low. If we do not skip buffering here, we would start buffering but
   202  		// not stop because we already observed the promotion of the new primary.
   203  		lastReparentAgo := now.Sub(sb.lastReparent)
   204  		if !sb.lastReparent.IsZero() && lastReparentAgo < minTimeBetweenFailovers {
   205  			sb.mu.Unlock()
   206  			msg := "NOT starting buffering"
   207  			if sb.mode == bufferModeDryRun {
   208  				msg = "Dry-run: Would NOT have started buffering"
   209  			}
   210  
   211  			sb.logTooRecent.Infof("%v for shard: %s because the last reparent is too recent (%v < %v)."+
   212  				" (A failover was detected by this seen error: %v.)",
   213  				msg, topoproto.KeyspaceShardString(keyspace, shard), lastReparentAgo, minTimeBetweenFailovers, err)
   214  
   215  			statsKeyWithReason := append(sb.statsKey, string(skippedLastReparentTooRecent))
   216  			requestsSkipped.Add(statsKeyWithReason, 1)
   217  			return nil, nil
   218  		}
   219  
   220  		sb.startBufferingLocked(err)
   221  	}
   222  
   223  	if sb.mode == bufferModeDryRun {
   224  		sb.mu.Unlock()
   225  		// Dry-run. Do not actually buffer the request and return early.
   226  		lastRequestsDryRunMax.Add(sb.statsKey, 1)
   227  		requestsBufferedDryRun.Add(sb.statsKey, 1)
   228  		return nil, nil
   229  	}
   230  
   231  	// Buffer request.
   232  	entry, err := sb.bufferRequestLocked(ctx)
   233  	sb.mu.Unlock()
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  	return sb.wait(ctx, entry)
   238  }
   239  
   240  // shouldBufferLocked returns true if the current request should be buffered
   241  // (based on the current state and whether the request detected a failover).
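         // In summary (derived from the switch statement below):
         //
         //	state     | failoverDetected | buffer?
         //	----------+------------------+------------------------
         //	IDLE      | false            | no
         //	IDLE      | true             | yes
         //	BUFFERING | false or true    | yes
         //	DRAINING  | false            | no
         //	DRAINING  | true             | no (retry immediately)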
   242  func (sb *shardBuffer) shouldBufferLocked(failoverDetected bool) bool {
   243  	switch s := sb.state; {
   244  	case s == stateIdle && !failoverDetected:
   245  		// No failover in progress.
   246  		return false
   247  	case s == stateIdle && failoverDetected:
   248  		// Not buffering yet, but new failover detected.
   249  		return true
   250  	case s == stateBuffering:
   251  		// Failover in progress.
   252  		return true
   253  	case s == stateDraining && !failoverDetected:
   254  		// Draining. Non-failover related requests can pass through.
   255  		return false
   256  	case s == stateDraining && failoverDetected:
   257  		// Possible race between request which saw failover-related error and the
   258  		// end of the failover. Do not buffer and let vtgate retry immediately.
   259  		return false
   260  	}
   261  	panic("BUG: All possible states must be covered by the switch expression above.")
   262  }
   263  
   264  func (sb *shardBuffer) startBufferingLocked(err error) {
   265  	// Reset monitoring data from previous failover.
   266  	lastRequestsInFlightMax.Set(sb.statsKey, 0)
   267  	lastRequestsDryRunMax.Set(sb.statsKey, 0)
   268  	failoverDurationSumMs.Reset(sb.statsKey)
   269  
   270  	sb.lastStart = sb.timeNow()
   271  	sb.logErrorIfStateNotLocked(stateIdle)
   272  	sb.state = stateBuffering
   273  	sb.queue = make([]*entry, 0)
   274  
   275  	sb.timeoutThread = newTimeoutThread(sb, sb.buf.config.MaxFailoverDuration)
   276  	sb.timeoutThread.start()
   277  	msg := "Starting buffering"
   278  	if sb.mode == bufferModeDryRun {
   279  		msg = "Dry-run: Would have started buffering"
   280  	}
   281  	starts.Add(sb.statsKey, 1)
   282  	log.Infof("%v for shard: %s (window: %v, size: %v, max failover duration: %v) (A failover was detected by this seen error: %v.)",
   283  		msg,
   284  		topoproto.KeyspaceShardString(sb.keyspace, sb.shard),
   285  		sb.buf.config.Window,
   286  		sb.buf.config.Size,
   287  		sb.buf.config.MaxFailoverDuration,
   288  		errorsanitizer.NormalizeError(err.Error()),
   289  	)
   290  }
   291  
   292  // logErrorIfStateNotLocked logs an error if the current state is not "state".
   293  // We do not panic/crash the process here because it is expected that a wrong
   294  // state is less severe than (potentially) crash-looping all vtgates.
    295  // Note: The suffix "Locked" is not related to the state. Instead, it stresses
   296  // that "sb.mu" must be locked before calling the method.
   297  func (sb *shardBuffer) logErrorIfStateNotLocked(state bufferState) {
   298  	if sb.state != state {
   299  		log.Errorf("BUG: Buffer state should be '%v' and not '%v'. Full state of buffer object: %#v Stacktrace:\n%s", state, sb.state, sb, debug.Stack())
   300  	}
   301  }
   302  
    303  // bufferRequestLocked creates a new entry in the queue for a request which
   304  // should be buffered.
    305  // It returns *entry which can be used as input for shardBuffer.remove(). This
   306  // is useful for canceled RPCs (e.g. due to deadline exceeded) which want to
   307  // give up their spot in the buffer. It also holds the "bufferCancel" function.
   308  // If buffering fails e.g. due to a full buffer, an error is returned.
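         // Note: a full buffer only results in an error if this shard's own queue is
         // empty (i.e. other shards' failovers consume all slots); otherwise the
         // oldest entry of this shard is evicted and its slot is reused.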
   309  func (sb *shardBuffer) bufferRequestLocked(ctx context.Context) (*entry, error) {
   310  	if !sb.buf.bufferSizeSema.TryAcquire() {
   311  		// Buffer is full. Evict the oldest entry and buffer this request instead.
   312  		if len(sb.queue) == 0 {
   313  			// Overall buffer is full, but this shard's queue is empty. That means
   314  			// there is at least one other shard failing over as well which consumes
   315  			// the whole buffer.
   316  			statsKeyWithReason := append(sb.statsKey, string(skippedBufferFull))
   317  			requestsSkipped.Add(statsKeyWithReason, 1)
   318  			return nil, bufferFullError
   319  		}
   320  
   321  		e := sb.queue[0]
    322  		// Evict the entry. Do not release its slot in the buffer; instead, reuse
    323  		// it for this new request.
   324  		// NOTE: We keep the lock to avoid racing with drain().
   325  		// NOTE: We're not waiting until the request finishes and instead reuse its
   326  		// slot immediately, i.e. the number of evicted requests + drained requests
   327  		// can be bigger than the buffer size.
   328  		sb.unblockAndWait(e, entryEvictedError, false /* releaseSlot */, false /* blockingWait */)
   329  		sb.queue = sb.queue[1:]
   330  		statsKeyWithReason := append(sb.statsKey, evictedBufferFull)
   331  		requestsEvicted.Add(statsKeyWithReason, 1)
   332  	}
   333  
   334  	e := &entry{
   335  		done:     make(chan struct{}),
   336  		deadline: sb.timeNow().Add(sb.buf.config.Window),
   337  	}
   338  	e.bufferCtx, e.bufferCancel = context.WithCancel(ctx)
   339  	sb.queue = append(sb.queue, e)
   340  
   341  	if max := lastRequestsInFlightMax.Counts()[sb.statsKeyJoined]; max < int64(len(sb.queue)) {
   342  		lastRequestsInFlightMax.Set(sb.statsKey, int64(len(sb.queue)))
   343  	}
   344  	requestsBuffered.Add(sb.statsKey, 1)
   345  
   346  	if len(sb.queue) == 1 {
   347  		sb.timeoutThread.notifyQueueNotEmpty()
   348  	}
   349  	return e, nil
   350  }
   351  
   352  // unblockAndWait unblocks a blocked request.
    353  // If releaseSlot is true, the buffer semaphore will be decreased by 1 once
    354  // the retried request has finished.
    355  // If blockingWait is true, this call will block until the retried request has
    356  // finished. This mode is used during the drain (to avoid flooding the primary)
   357  // while the non-blocking mode is used when a) evicting a request (e.g. because
   358  // the buffer is full or it exceeded the buffering window) or b) when the
   359  // request was canceled from outside and we removed it.
   360  func (sb *shardBuffer) unblockAndWait(e *entry, err error, releaseSlot, blockingWait bool) {
   361  	// Set error such that the request will see it.
   362  	e.err = err
   363  	// Tell blocked request to stop waiting.
   364  	close(e.done)
   365  
   366  	if blockingWait {
   367  		sb.waitForRequestFinish(e, releaseSlot, false /* async */)
   368  	} else {
   369  		sb.wg.Add(1)
   370  		go sb.waitForRequestFinish(e, releaseSlot, true /* async */)
   371  	}
   372  }
   373  
   374  func (sb *shardBuffer) waitForRequestFinish(e *entry, releaseSlot, async bool) {
   375  	if async {
   376  		defer sb.wg.Done()
   377  	}
   378  
   379  	// Wait for unblocked request to finish.
   380  	<-e.bufferCtx.Done()
   381  
   382  	// Release the slot to the buffer.
   383  	// NOTE: We always wait for the request first, even if the calling code like
   384  	// the buffer full eviction or the timeout thread does not block on us.
   385  	// This way, the request's slot can only be reused after the request finished.
   386  	if releaseSlot {
   387  		sb.buf.bufferSizeSema.Release()
   388  	}
   389  }
   390  
   391  // wait blocks while the request is buffered during the failover.
   392  // See Buffer.WaitForFailoverEnd() for the API contract of the return values.
   393  func (sb *shardBuffer) wait(ctx context.Context, e *entry) (RetryDoneFunc, error) {
   394  	select {
   395  	case <-ctx.Done():
   396  		sb.remove(e)
   397  		return nil, vterrors.Errorf(vterrors.Code(contextCanceledError), "%v: %v", contextCanceledError, ctx.Err())
   398  	case <-e.done:
   399  		return e.bufferCancel, e.err
   400  	}
   401  }
   402  
   403  // oldestEntry returns the head of the queue or nil if the queue is empty.
   404  func (sb *shardBuffer) oldestEntry() *entry {
   405  	sb.mu.Lock()
   406  	defer sb.mu.Unlock()
   407  
   408  	if len(sb.queue) > 0 {
   409  		return sb.queue[0]
   410  	}
   411  	return nil
   412  }
   413  
   414  // evictOldestEntry is used by timeoutThread to evict the head entry of the
   415  // queue if it exceeded its buffering window.
   416  func (sb *shardBuffer) evictOldestEntry(e *entry) {
   417  	sb.mu.Lock()
   418  	defer sb.mu.Unlock()
   419  
   420  	if len(sb.queue) == 0 || e != sb.queue[0] {
   421  		// Entry is already removed e.g. by remove(). Ignore it.
   422  		return
   423  	}
   424  
   425  	// Evict the entry.
   426  	//
   427  	// NOTE: We're not waiting for the request to finish in order to unblock the
   428  	// timeout thread as fast as possible. However, the slot of the evicted
   429  	// request is only returned after it has finished i.e. the buffer may stay
   430  	// full in the meantime. This is a design tradeoff to keep things simple and
   431  	// avoid additional pressure on the primary tablet.
   432  	sb.unblockAndWait(e, nil /* err */, true /* releaseSlot */, false /* blockingWait */)
   433  	sb.queue = sb.queue[1:]
   434  	statsKeyWithReason := append(sb.statsKey, evictedWindowExceeded)
   435  	requestsEvicted.Add(statsKeyWithReason, 1)
   436  }
   437  
   438  // remove must be called when the request was canceled from outside and not
   439  // internally.
   440  func (sb *shardBuffer) remove(toRemove *entry) {
   441  	sb.mu.Lock()
   442  	defer sb.mu.Unlock()
   443  
   444  	if sb.queue == nil {
   445  		// Queue is cleared because we're already in the DRAIN phase.
   446  		return
   447  	}
   448  
   449  	// If entry is still in the queue, delete it and cancel it internally.
   450  	for i, e := range sb.queue {
   451  		if e == toRemove {
   452  			// Delete entry at index "i" from slice.
   453  			sb.queue = append(sb.queue[:i], sb.queue[i+1:]...)
   454  
   455  			// Cancel the entry's "bufferCtx".
   456  			// The usual drain or eviction code would unblock the request and then
   457  			// wait for the "bufferCtx" to be done.
   458  			// But this code path is different because it's going to return an error
   459  			// to the request and not the "e.bufferCancel" function i.e. the request
   460  			// cannot cancel the "bufferCtx" itself.
    461  			// Therefore, we call "e.bufferCancel". This also prevents the
    462  			// context's Go routine from leaking.
   463  			e.bufferCancel()
   464  			// Release the buffer slot and close the "e.done" channel.
   465  			// By closing "e.done", we finish it explicitly and timeoutThread will
   466  			// find out about it as well.
   467  			sb.unblockAndWait(e, nil /* err */, true /* releaseSlot */, false /* blockingWait */)
   468  
   469  			// Track it as "ContextDone" eviction.
   470  			statsKeyWithReason := append(sb.statsKey, string(evictedContextDone))
   471  			requestsEvicted.Add(statsKeyWithReason, 1)
   472  			return
   473  		}
   474  	}
   475  
   476  	// Entry was already removed. Keep the queue as it is.
   477  }
   478  
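         // recordKeyspaceEvent stops buffering when the disruption in this shard is
         // reported as resolved. If the shard is no longer serving (e.g. because the
         // keyspace has been resharded), the buffered requests are unblocked with
         // ShardMissingError instead.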
   479  func (sb *shardBuffer) recordKeyspaceEvent(alias *topodatapb.TabletAlias, stillServing bool) {
   480  	sb.mu.Lock()
   481  	defer sb.mu.Unlock()
   482  
   483  	log.Infof("disruption in shard %s/%s resolved (serving: %v)", sb.keyspace, sb.shard, stillServing)
   484  
   485  	if !topoproto.TabletAliasEqual(alias, sb.currentPrimary) {
   486  		if sb.currentPrimary != nil {
   487  			sb.lastReparent = sb.timeNow()
   488  		}
   489  		sb.currentPrimary = alias
   490  	}
   491  	if stillServing {
   492  		sb.stopBufferingLocked(stopFailoverEndDetected, "a primary promotion has been detected")
   493  	} else {
   494  		sb.stopBufferingLocked(stopShardMissing, "the keyspace has been resharded")
   495  	}
   496  }
   497  
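         // recordExternallyReparentedTimestamp tracks the highest seen
         // "TabletExternallyReparentedTimestamp" value and stops buffering once a
         // higher value, i.e. the end of the failover, is observed.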
   498  func (sb *shardBuffer) recordExternallyReparentedTimestamp(timestamp int64, alias *topodatapb.TabletAlias) {
   499  	// Fast path (read lock): Check if new timestamp is higher.
   500  	sb.mu.RLock()
   501  	if timestamp <= sb.externallyReparented {
   502  		// Do nothing. Equal values are reported if the primary has not changed.
   503  		// Smaller values can be reported during the failover by the old primary
   504  		// after the new primary already took over.
   505  		sb.mu.RUnlock()
   506  		return
   507  	}
   508  	sb.mu.RUnlock()
   509  
   510  	// New timestamp is higher. Stop buffering if running.
   511  	sb.mu.Lock()
   512  	defer sb.mu.Unlock()
   513  
   514  	// Re-check value after acquiring write lock.
   515  	if timestamp <= sb.externallyReparented {
   516  		return
   517  	}
   518  
   519  	sb.externallyReparented = timestamp
   520  	if !topoproto.TabletAliasEqual(alias, sb.currentPrimary) {
   521  		if sb.currentPrimary != nil {
   522  			sb.lastReparent = sb.timeNow()
   523  		}
   524  		sb.currentPrimary = alias
   525  	}
   526  	sb.stopBufferingLocked(stopFailoverEndDetected, "failover end detected")
   527  }
   528  
   529  func (sb *shardBuffer) stopBufferingDueToMaxDuration() {
   530  	sb.mu.Lock()
   531  	defer sb.mu.Unlock()
   532  
   533  	sb.stopBufferingLocked(stopMaxFailoverDurationExceeded,
   534  		fmt.Sprintf("stopping buffering because failover did not finish in time (%v)", sb.buf.config.MaxFailoverDuration))
   535  }
   536  
   537  func (sb *shardBuffer) stopBufferingLocked(reason stopReason, details string) {
   538  	if sb.state != stateBuffering {
   539  		return
   540  	}
   541  
   542  	// Stop buffering.
   543  	sb.lastEnd = sb.timeNow()
   544  	d := sb.lastEnd.Sub(sb.lastStart)
   545  
   546  	statsKeyWithReason := append(sb.statsKey, string(reason))
   547  	stops.Add(statsKeyWithReason, 1)
   548  
   549  	lastFailoverDurationMs.Set(sb.statsKey, int64(d/time.Millisecond))
   550  	failoverDurationSumMs.Add(sb.statsKey, int64(d/time.Millisecond))
   551  	if sb.mode == bufferModeDryRun {
   552  		utilDryRunMax := int64(
   553  			float64(lastRequestsDryRunMax.Counts()[sb.statsKeyJoined]) / float64(sb.buf.config.Size) * 100.0)
   554  		utilizationDryRunSum.Add(sb.statsKey, utilDryRunMax)
   555  	} else {
   556  		utilMax := int64(
   557  			float64(lastRequestsInFlightMax.Counts()[sb.statsKeyJoined]) / float64(sb.buf.config.Size) * 100.0)
   558  		utilizationSum.Add(sb.statsKey, utilMax)
   559  	}
   560  
   561  	sb.logErrorIfStateNotLocked(stateBuffering)
   562  	sb.state = stateDraining
   563  	q := sb.queue
   564  	// Clear the queue such that remove(), oldestEntry() and evictOldestEntry()
   565  	// will not work on obsolete data.
   566  	sb.queue = nil
   567  
   568  	msg := "Stopping buffering"
   569  	if sb.mode == bufferModeDryRun {
   570  		msg = "Dry-run: Would have stopped buffering"
   571  	}
   572  	log.Infof("%v for shard: %s after: %.1f seconds due to: %v. Draining %d buffered requests now.", msg, topoproto.KeyspaceShardString(sb.keyspace, sb.shard), d.Seconds(), details, len(q))
   573  
   574  	var clientEntryError error
   575  	if reason == stopShardMissing {
   576  		clientEntryError = ShardMissingError
   577  	}
   578  
   579  	// Start the drain. (Use a new Go routine to release the lock.)
   580  	sb.wg.Add(1)
   581  	go sb.drain(q, clientEntryError)
   582  }
   583  
   584  func (sb *shardBuffer) drain(q []*entry, err error) {
   585  	defer sb.wg.Done()
   586  
    587  	// stop must be called outside of the lock because the timeout thread may
    588  	// access shardBuffer as well, e.g. to get the current oldest entry.
   589  	sb.timeoutThread.stop()
   590  
   591  	start := sb.timeNow()
   592  	// TODO(mberlin): Parallelize the drain by pumping the data through a channel.
   593  	for _, e := range q {
   594  		sb.unblockAndWait(e, err, true /* releaseSlot */, true /* blockingWait */)
   595  	}
   596  	d := sb.timeNow().Sub(start)
   597  	log.Infof("Draining finished for shard: %s Took: %v for: %d requests.", topoproto.KeyspaceShardString(sb.keyspace, sb.shard), d, len(q))
   598  	requestsDrained.Add(sb.statsKey, int64(len(q)))
   599  
   600  	// Draining is done. Change state from "draining" to "idle".
   601  	sb.mu.Lock()
   602  	defer sb.mu.Unlock()
   603  	sb.logErrorIfStateNotLocked(stateDraining)
   604  	sb.state = stateIdle
   605  	sb.timeoutThread = nil
   606  }
   607  
   608  func (sb *shardBuffer) shutdown() {
   609  	sb.mu.Lock()
   610  	sb.stopBufferingLocked(stopShutdown, "shutdown")
   611  	sb.mu.Unlock()
   612  }
   613  
   614  func (sb *shardBuffer) waitForShutdown() {
   615  	sb.wg.Wait()
   616  }
   617  
    618  // testGetSize is used by unit tests only to find out the current number
   619  // of buffered requests.
   620  // TODO(mberlin): Remove this if we add a more general statistics reporting.
   621  func (sb *shardBuffer) testGetSize() int {
   622  	sb.mu.RLock()
   623  	defer sb.mu.RUnlock()
   624  	return len(sb.queue)
   625  }
   626  
    627  // testGetState is used by unit tests only to probe the current state.
   628  func (sb *shardBuffer) testGetState() bufferState {
   629  	sb.mu.RLock()
   630  	defer sb.mu.RUnlock()
   631  	return sb.state
   632  }