github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/minprop/tracker.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package minprop
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sort"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    22  	"github.com/cockroachdb/cockroach/pkg/util/log"
    23  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    24  )
    25  
    26  // Tracker implements TrackerI.
    27  type Tracker struct {
    28  	mu struct {
    29  		syncutil.Mutex
    30  		// closed is the most recently closed timestamp.
    31  		closed      hlc.Timestamp
    32  		closedEpoch ctpb.Epoch
    33  
    34  		// The variables below track required information for the next closed
    35  		// timestamp and beyond. First, `next` is the timestamp that will be
    36  		// closed out next (i.e. will replace `closed`).
    37  		//
    38  		// "left" and "right" refers to how the timestamps at which the
    39  		// associated command evaluations take place relate to `next`.
    40  		// `left`-tracked proposals are taken into account for the next closed
    41  		// timestamp, i.e. they could mutate at timestamps <= `next`. `right`
    42  		// proposals affect only MVCC timestamps > `next` and thus will become
    43  		// relevant only after `next` has been closed out, at which point the
    44  		// "right" set will replace the "left".
    45  		//
    46  		//    closed           next
    47  		//      |          left | right
    48  		//      |               |
    49  		//      |               |
    50  		//      v               v
    51  		//---------------------------------------------------------> time
    52  		//
    53  		// A replica wishing to serve a follower read will first have to catch
    54  		// up to a lease applied index that is guaranteed to include all writes
    55  		// affecting the closed timestamp or below. When `next` is closed out,
    56  		// the set of relevant Lease Applied Indexes will be stored in `leftMLAI`.
    57  		//
    58  		// This is augmented by reference counts for the proposals currently in
    59  		// the process of evaluating. `next` can only be closed out once
    60  		// `leftRef` has been drained (i.e. has dropped to zero); new proposals
    61  		// are always forced above `next` and consequently count towards
    62  		// `rightRef`.
    63  		//
    64  		// Epochs track the highest liveness epoch observed for any released
    65  		// proposals. Tracking a max epoch allows the MPT to provide some MLAI
    66  		// information about the current epoch when calls to Close straddle multiple
    67  		// different epochs. Before epoch tracking was added the client of the MPT
    68  		// was forced to assume that the MLAI information from the current call to
    69  		// Close corresponded to the highest known epoch as of the previous call to
    70  		// Close. This is problematic in cases where an epoch change leads to a
    71  		// lease change for an otherwise quiescent range. If this mechanism were
    72  		// not in place then the client would never learn about an MLAI for the
    73  		// current epoch. Clients provide their view of the current epoch to calls
    74  		// to Close which use this information to determine whether the current
    75  		// state should be moved and whether the caller can make use of the
    76  		// currently tracked data. Each side tracks data which corresponds exactly
    77  		// to the side's epoch value. Releasing a proposal into the tracker at a
    78  		// later epoch than is currently tracked will result in the current data
    79  		// corresponding to the prior epoch to be evicted.
    80  
    81  		next                  hlc.Timestamp
    82  		leftMLAI, rightMLAI   map[roachpb.RangeID]ctpb.LAI
    83  		leftRef, rightRef     int
    84  		leftEpoch, rightEpoch ctpb.Epoch
    85  	}
    86  }
    87  
// Compile-time assertion that *Tracker satisfies the closedts.TrackerI
// interface.
var _ closedts.TrackerI = (*Tracker)(nil)
    89  
    90  // NewTracker returns a Tracker initialized to a closed timestamp of zero and
    91  // a next closed timestamp of one logical tick past zero.
    92  func NewTracker() *Tracker {
    93  	t := &Tracker{}
    94  	const initialEpoch = 1
    95  	t.mu.closedEpoch = initialEpoch
    96  	t.mu.leftEpoch = initialEpoch
    97  	t.mu.rightEpoch = initialEpoch
    98  	t.mu.next = hlc.Timestamp{Logical: 1}
    99  	t.mu.leftMLAI = map[roachpb.RangeID]ctpb.LAI{}
   100  	t.mu.rightMLAI = map[roachpb.RangeID]ctpb.LAI{}
   101  	return t
   102  }
   103  
   104  // String prints a string representation of the Tracker's state.
   105  func (t *Tracker) String() string {
   106  	t.mu.Lock()
   107  	defer t.mu.Unlock()
   108  	closed, next := t.mu.closed, t.mu.next
   109  	leftRef, rightRef := t.mu.leftRef, t.mu.rightRef
   110  	leftEpoch, rightEpoch := t.mu.leftEpoch, t.mu.rightEpoch
   111  
   112  	type item struct {
   113  		rangeID roachpb.RangeID
   114  		mlai    ctpb.LAI
   115  		left    bool
   116  	}
   117  
   118  	var lais []item
   119  	for rangeID, mlai := range t.mu.leftMLAI {
   120  		lais = append(lais, item{rangeID, mlai, true})
   121  	}
   122  	for rangeID, mlai := range t.mu.rightMLAI {
   123  		lais = append(lais, item{rangeID, mlai, false})
   124  	}
   125  
   126  	sort.Slice(lais, func(i, j int) bool {
   127  		if lais[i].rangeID != lais[j].rangeID {
   128  			return lais[i].rangeID < lais[j].rangeID
   129  		}
   130  		return lais[i].mlai < lais[j].mlai
   131  	})
   132  
   133  	var lines string
   134  	for _, item := range lais {
   135  		var format string
   136  		if !item.left {
   137  			format = `      |               @ %-2d     (r%d)
   138  `
   139  		} else {
   140  			format = `      |   %11d @        (r%d)
   141  `
   142  		}
   143  		lines += fmt.Sprintf(format, item.mlai, item.rangeID)
   144  	}
   145  
   146  	return fmt.Sprintf(`
   147    closed=%s
   148        |            next=%s
   149        |          left | right
   150        |           %3d # %d
   151        |           %3d e %d
   152  `+lines+
   153  		`      v               v
   154  ---------------------------------------------------------> time
   155  `,
   156  		closed, next, leftRef, rightRef, leftEpoch, rightEpoch,
   157  	)
   158  }
   159  
   160  // Close attempts to close out the current candidate timestamp (replacing it
   161  // with the provided one). This is possible only if tracked proposals that were
   162  // evaluating when Close was previously called have since completed. On success,
   163  // all subsequent proposals will be forced to evaluate strictly above the
   164  // provided timestamp, and the timestamp previously passed to Close is returned
   165  // as a closed timestamp along with a map of minimum Lease Applied Indexes
   166  // reflecting the updates for the past period. On failure, the previous closed
   167  // timestamp is returned along with a nil map (which can be treated by callers
   168  // like a successful call that happens to not return any new information).
   169  // Similarly, failure to provide a timestamp strictly larger than that to be
   170  // closed out next results in the same "idempotent" return values.
   171  //
   172  // Callers additionally provide the current expected epoch value, the liveness
   173  // epoch at which the caller intends to advertise this closed timestamp. The
   174  // caller must know that it is live at a timestamp greater than or equal to the
   175  // timestamp which the tracker will close. For correctness purposes this will
   176  // be the case if the caller knows that it is live at next and calls to Close()
   177  // pass monontic calues for next. If the current expected epoch is older than
   178  // the currently tracked data then the timestamp will fail to be closed. If the
   179  // expected epoch value is older than the epoch tracked on the left but
   180  // corresponds to the epoch of the previous successful close then the previous
   181  // closed timestamp is returned along with a nil map. This situation is just
   182  // like the unsuccessful close scenario due to unreleased proposals. This
   183  // behavior enables the caller to successfully obtain the tracked data at the
   184  // newer epoch in a later query after its epoch has updated. If the caller's
   185  // expected epoch is even older than the previously returned epoch then zero
   186  // values are returned. If the caller's expected epoch is newer than that of
   187  // tracked data the state of the tracker is progressed but zero values are
   188  // returned.
   189  func (t *Tracker) Close(
   190  	next hlc.Timestamp, expCurEpoch ctpb.Epoch,
   191  ) (ts hlc.Timestamp, mlai map[roachpb.RangeID]ctpb.LAI, ok bool) {
   192  	t.mu.Lock()
   193  	defer t.mu.Unlock()
   194  
   195  	if log.V(3) {
   196  		log.Infof(context.TODO(),
   197  			"close: leftRef=%d (ep: %d) rightRef=%d (ep: %d) next=%s closed=%s@ (ep: %d) new=%s (ep: %d)",
   198  			t.mu.leftRef, t.mu.leftEpoch, t.mu.rightRef, t.mu.rightEpoch, t.mu.next,
   199  			t.mu.closed, t.mu.closedEpoch, next, expCurEpoch)
   200  	}
   201  
   202  	// Make sure to not let `t.mu.next` regress, or we'll accept proposals
   203  	// that violate earlier closed timestamps. (And if it stayed the same
   204  	// the logic in the closure returned from Track would fall apart).
   205  	canClose := t.mu.leftRef == 0 && t.mu.next.Less(next)
   206  
   207  	// NB: the expected closed epoch may not match the epoch for the timestamp we
   208  	// are currently closing. If the expected closed epoch is earlier than the
   209  	// epoch tracked on the left then the caller likely read its liveness just
   210  	// before an epoch change and we should not move the tracker state as the
   211  	// caller will likely visit again with the new epoch and would like the
   212  	// tracked information. If the expCurEpoch is greater than or equal to the
   213  	// current epoch, proceed with closing out the current timestamp, deferring
   214  	// the decision regarding whether to return the updated state based on epoch
   215  	// until after updating the data.
   216  	if canClose && t.mu.leftEpoch <= expCurEpoch {
   217  		// NB: if rightRef is also zero, then nothing is in flight right now and
   218  		// we could theoretically close out `next`. However, we'd also have to
   219  		// merge the left and right MLAI maps, and would force followers to
   220  		// catch up to more commands much more rapidly than can be expected of
   221  		// them. If we want to make use of this optimization, we should emit
   222  		// two closed timestamp updates for this case.
   223  		t.mu.closed = t.mu.next
   224  		t.mu.closedEpoch = t.mu.leftEpoch
   225  		mlai = t.mu.leftMLAI
   226  
   227  		// NB: if the expCurEpoch is after the epoch tracked on the right, we'll
   228  		// never be able to use that information so clear it. The below logic is
   229  		// not required for correctness but adds an invariant that after a call to
   230  		// Close with a give expCurEpoch no state corresponding to an earlier epoch
   231  		// will be tracked on either side. Without this logic, subsequent proposals
   232  		// or Close calls at the later epoch would lead to this data being
   233  		// discarded at that point.
   234  		if t.mu.rightEpoch < expCurEpoch {
   235  			t.mu.rightEpoch = expCurEpoch
   236  			clearMLAIMap(t.mu.rightMLAI)
   237  		}
   238  
   239  		// `next` moves forward to the provided timestamp, and picks up the
   240  		// right refcount and MLAIs (so that it is now responsible for tracking
   241  		// everything that's in-flight).
   242  		t.mu.leftMLAI = t.mu.rightMLAI
   243  		t.mu.leftRef = t.mu.rightRef
   244  		t.mu.leftEpoch = t.mu.rightEpoch
   245  		t.mu.rightMLAI = map[roachpb.RangeID]ctpb.LAI{}
   246  		t.mu.rightRef = 0
   247  
   248  		t.mu.next = next
   249  	}
   250  
   251  	if t.mu.closedEpoch != expCurEpoch {
   252  		return hlc.Timestamp{}, nil, false
   253  	}
   254  	return t.mu.closed, mlai, true
   255  }
   256  
   257  // Track is called before evaluating a proposal. It returns the minimum
   258  // timestamp at which the proposal can be evaluated (i.e. the request timestamp
   259  // needs to be forwarded if necessary), and acquires a reference with the
   260  // Tracker. This reference is released by calling the returned closure either
   261  // a) before proposing the command, supplying the Lease Applied Index at which
   262  //    the proposal will be carried out, or
   263  // b) with zero arguments if the command won't end up being proposed (i.e. hit
   264  //    an error during evaluation).
   265  //
   266  // The ReleaseFunc is not thread safe. For convenience, it may be called with
   267  // zero arguments once after a regular call.
   268  func (t *Tracker) Track(ctx context.Context) (hlc.Timestamp, closedts.ReleaseFunc) {
   269  	shouldLog := log.V(3)
   270  
   271  	t.mu.Lock()
   272  	minProp := t.mu.next.Next()
   273  	t.mu.rightRef++
   274  	t.mu.Unlock()
   275  
   276  	if shouldLog {
   277  		log.Infof(ctx, "track: proposal on the right at minProp %s", minProp)
   278  	}
   279  
   280  	var calls int
   281  	release := func(ctx context.Context, epoch ctpb.Epoch, rangeID roachpb.RangeID, lai ctpb.LAI) {
   282  		calls++
   283  		if calls != 1 {
   284  			if lai != 0 || rangeID != 0 || calls > 2 {
   285  				log.Fatalf(ctx, "command released %d times, this time with arguments (%d, %d)",
   286  					log.Safe(calls), log.Safe(rangeID), log.Safe(lai))
   287  			}
   288  			return
   289  		}
   290  		t.release(ctx, minProp, epoch, rangeID, lai, shouldLog)
   291  	}
   292  
   293  	return minProp, release
   294  }
   295  
   296  // release is the business logic to release properly account for the release of
   297  // a tracked proposal. It is called from the ReleaseFunc closure returned from
   298  // Track.
   299  func (t *Tracker) release(
   300  	ctx context.Context,
   301  	minProp hlc.Timestamp,
   302  	epoch ctpb.Epoch,
   303  	rangeID roachpb.RangeID,
   304  	lai ctpb.LAI,
   305  	shouldLog bool,
   306  ) {
   307  	t.mu.Lock()
   308  	defer t.mu.Unlock()
   309  	var left bool
   310  	if minProp == t.mu.closed.Next() {
   311  		left = true
   312  	} else if minProp == t.mu.next.Next() {
   313  		left = false
   314  	} else {
   315  		log.Fatalf(ctx, "min proposal %s not tracked under closed (%s) or next (%s) timestamp", minProp, t.mu.closed, t.mu.next)
   316  	}
   317  	// If the update is from the left side, clear all existing MLAIs from the left
   318  	// to uphold the invariant that all tracked MLAIs belong to the same (and
   319  	// largest seen) epoch. It would not violate correctness to clear the data on
   320  	// the left even if the proposal being released is tracked on the right; it is
   321  	// likely that the next call to close will observe the later epoch and thus
   322  	// not read this data but the code chooses to retain it.
   323  	if left && epoch > t.mu.leftEpoch {
   324  		t.mu.leftEpoch = epoch
   325  		clearMLAIMap(t.mu.leftMLAI)
   326  	}
   327  	// The right side is bumped and cleared when the epoch increases without
   328  	// taking into account which side the current proposal is tracked under
   329  	// because bumping the left side implies that the information from the right
   330  	// side will never be retrieved by the client (as epochs only ever go up and
   331  	// the current left will be emitted before the current right side).
   332  	if epoch > t.mu.rightEpoch {
   333  		t.mu.rightEpoch = epoch
   334  		clearMLAIMap(t.mu.rightMLAI)
   335  	}
   336  	if left {
   337  		releaseProposal(ctx, "left", shouldLog, minProp, rangeID, lai,
   338  			&t.mu.leftRef, t.mu.leftMLAI, t.mu.leftEpoch != epoch)
   339  	} else {
   340  		releaseProposal(ctx, "right", shouldLog, minProp, rangeID, lai,
   341  			&t.mu.rightRef, t.mu.rightMLAI, t.mu.rightEpoch != epoch)
   342  	}
   343  }
   344  
   345  func clearMLAIMap(m map[roachpb.RangeID]ctpb.LAI) {
   346  	for rangeID := range m {
   347  		delete(m, rangeID)
   348  	}
   349  }
   350  
   351  func releaseProposal(
   352  	ctx context.Context,
   353  	side string,
   354  	shouldLog bool,
   355  	minProp hlc.Timestamp,
   356  	rangeID roachpb.RangeID,
   357  	lai ctpb.LAI,
   358  	refs *int,
   359  	mlaiMap map[roachpb.RangeID]ctpb.LAI,
   360  	fromPreviousEpoch bool,
   361  ) {
   362  	if shouldLog {
   363  		log.Infof(ctx, "release: minprop %s on r%d@%d tracked on the %s", minProp, rangeID, lai, side)
   364  	}
   365  	*refs--
   366  	if *refs < 0 {
   367  		log.Fatalf(ctx, "min proposal %s ref count < 0", side)
   368  	}
   369  	if rangeID == 0 {
   370  		return
   371  	}
   372  	if !fromPreviousEpoch {
   373  		if curLAI, found := mlaiMap[rangeID]; !found || curLAI < lai {
   374  			mlaiMap[rangeID] = lai
   375  		}
   376  	}
   377  }