github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/rangefeed/registry.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rangefeed
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/storage"
    22  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
    24  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    25  	"github.com/cockroachdb/cockroach/pkg/util/interval"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    29  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    31  	"github.com/cockroachdb/errors"
    32  )
    33  
// Stream is an object capable of transmitting RangeFeedEvents. A registration
// writes both catch-up scan results and buffered live events to its Stream.
type Stream interface {
	// Context returns the context for this stream. The registration's output
	// loop exits when this context is done.
	Context() context.Context
	// Send blocks until it sends the event, the stream is done, or the stream
	// breaks. Send must be safe to call on the same stream in different
	// goroutines.
	Send(*roachpb.RangeFeedEvent) error
}
    42  
// registration is an instance of a rangefeed subscriber who has
// registered to receive updates for a specific range of keys.
// Updates are delivered to its stream until one of the following
// conditions is met:
// 1. a Send to the Stream returns an error
// 2. the Stream's context is canceled
// 3. the registration is manually unregistered
//
// In all cases, when a registration is unregistered its error
// channel is sent an error to inform it that the registration
// has finished.
type registration struct {
	// Input.

	// span is the key span the registration listens on. Checkpoint events
	// are narrowed to this span before being buffered.
	span             roachpb.Span
	// catchupTimestamp is the registration's (exclusive) starting timestamp:
	// only versions strictly newer than it are emitted.
	catchupTimestamp hlc.Timestamp
	// catchupIter, if non-nil, is consumed by runCatchupScan, which closes it
	// and resets the field to nil when the scan finishes.
	catchupIter      storage.SimpleIterator
	// withDiff indicates whether the subscriber wants previous values
	// populated on value events. When false, previous values are stripped.
	withDiff         bool
	// metrics receives the catch-up scan duration.
	metrics          *Metrics

	// Output.

	// stream is where events are sent.
	stream Stream
	// errC is sent exactly one error (possibly nil) by disconnect.
	errC   chan<- *roachpb.Error

	// Internal.

	// id is assigned by registry.Register and backs interval.Interface.ID.
	id   int64
	// keys is span.AsRange(), assigned by registry.Register and backs
	// interval.Interface.Range.
	keys interval.Range
	// buf holds events published to the registration until the output loop
	// forwards them to stream. Its capacity is fixed at construction.
	buf  chan *roachpb.RangeFeedEvent

	mu struct {
		// Locker is set to a *syncutil.Mutex by newRegistration.
		// NOTE(review): presumably held as an interface so that the struct
		// can be copied by value (newRegistration returns one) without
		// copying a mutex - confirm.
		sync.Locker
		// True if this registration buffer has overflowed, dropping a live event.
		// This will cause the registration to exit with an error once the buffer
		// has been emptied.
		overflowed bool
		// Boolean indicating if all events have been output to stream. Used only
		// for testing.
		caughtUp bool
		// Management of the output loop goroutine, used to ensure proper teardown.
		// outputLoopCancelFn is installed by runOutputLoop and invoked by
		// disconnect; disconnected is set by disconnect so that the error
		// channel is signaled at most once.
		outputLoopCancelFn func()
		disconnected       bool
	}
}
    85  
    86  func newRegistration(
    87  	span roachpb.Span,
    88  	startTS hlc.Timestamp,
    89  	catchupIter storage.SimpleIterator,
    90  	withDiff bool,
    91  	bufferSz int,
    92  	metrics *Metrics,
    93  	stream Stream,
    94  	errC chan<- *roachpb.Error,
    95  ) registration {
    96  	r := registration{
    97  		span:             span,
    98  		catchupTimestamp: startTS,
    99  		catchupIter:      catchupIter,
   100  		withDiff:         withDiff,
   101  		metrics:          metrics,
   102  		stream:           stream,
   103  		errC:             errC,
   104  		buf:              make(chan *roachpb.RangeFeedEvent, bufferSz),
   105  	}
   106  	r.mu.Locker = &syncutil.Mutex{}
   107  	r.mu.caughtUp = true
   108  	return r
   109  }
   110  
   111  // publish attempts to send a single event to the output buffer for this
   112  // registration. If the output buffer is full, the overflowed flag is set,
   113  // indicating that live events were lost and a catchup scan should be initiated.
   114  // If overflowed is already set, events are ignored and not written to the
   115  // buffer.
   116  func (r *registration) publish(event *roachpb.RangeFeedEvent) {
   117  	r.validateEvent(event)
   118  	event = r.maybeStripEvent(event)
   119  
   120  	r.mu.Lock()
   121  	defer r.mu.Unlock()
   122  	if r.mu.overflowed {
   123  		return
   124  	}
   125  	select {
   126  	case r.buf <- event:
   127  		r.mu.caughtUp = false
   128  	default:
   129  		// Buffer exceeded and we are dropping this event. Registration will need
   130  		// a catch-up scan.
   131  		r.mu.overflowed = true
   132  	}
   133  }
   134  
   135  // validateEvent checks that the event contains enough information for the
   136  // registation.
   137  func (r *registration) validateEvent(event *roachpb.RangeFeedEvent) {
   138  	switch t := event.GetValue().(type) {
   139  	case *roachpb.RangeFeedValue:
   140  		if t.Key == nil {
   141  			panic(fmt.Sprintf("unexpected empty RangeFeedValue.Key: %v", t))
   142  		}
   143  		if t.Value.RawBytes == nil {
   144  			panic(fmt.Sprintf("unexpected empty RangeFeedValue.Value.RawBytes: %v", t))
   145  		}
   146  		if t.Value.Timestamp.IsEmpty() {
   147  			panic(fmt.Sprintf("unexpected empty RangeFeedValue.Value.Timestamp: %v", t))
   148  		}
   149  	case *roachpb.RangeFeedCheckpoint:
   150  		if t.Span.Key == nil {
   151  			panic(fmt.Sprintf("unexpected empty RangeFeedCheckpoint.Span.Key: %v", t))
   152  		}
   153  	default:
   154  		panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t))
   155  	}
   156  }
   157  
   158  // maybeStripEvent determines whether the event contains excess information not
   159  // applicable to the current registration. If so, it makes a copy of the event
   160  // and strips the incompatible information to match only what the registration
   161  // requested.
   162  func (r *registration) maybeStripEvent(event *roachpb.RangeFeedEvent) *roachpb.RangeFeedEvent {
   163  	ret := event
   164  	copyOnWrite := func() interface{} {
   165  		if ret == event {
   166  			ret = event.ShallowCopy()
   167  		}
   168  		return ret.GetValue()
   169  	}
   170  
   171  	switch t := ret.GetValue().(type) {
   172  	case *roachpb.RangeFeedValue:
   173  		if t.PrevValue.IsPresent() && !r.withDiff {
   174  			// If no registrations for the current Range are requesting previous
   175  			// values, then we won't even retrieve them on the Raft goroutine.
   176  			// However, if any are and they overlap with an update then the
   177  			// previous value on the corresponding events will be populated.
   178  			// If we're in this case and any other registrations don't want
   179  			// previous values then we'll need to strip them.
   180  			t = copyOnWrite().(*roachpb.RangeFeedValue)
   181  			t.PrevValue = roachpb.Value{}
   182  		}
   183  	case *roachpb.RangeFeedCheckpoint:
   184  		if !t.Span.EqualValue(r.span) {
   185  			// Checkpoint events are always created spanning the entire Range.
   186  			// However, a registration might not be listening on updates over
   187  			// the entire Range. If this is the case then we need to constrain
   188  			// the checkpoint events published to that registration to just the
   189  			// span that it's listening on. This is more than just a convenience
   190  			// to consumers - it would be incorrect to say that a rangefeed has
   191  			// observed all values up to the checkpoint timestamp over a given
   192  			// key span if any updates to that span have been filtered out.
   193  			if !t.Span.Contains(r.span) {
   194  				panic(fmt.Sprintf("registration span %v larger than checkpoint span %v", r.span, t.Span))
   195  			}
   196  			t = copyOnWrite().(*roachpb.RangeFeedCheckpoint)
   197  			t.Span = r.span
   198  		}
   199  	default:
   200  		panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t))
   201  	}
   202  	return ret
   203  }
   204  
   205  // disconnect cancels the output loop context for the registration and passes an
   206  // error to the output error stream for the registration. This also sets the
   207  // disconnected flag on the registration, preventing it from being disconnected
   208  // again.
   209  func (r *registration) disconnect(pErr *roachpb.Error) {
   210  	r.mu.Lock()
   211  	defer r.mu.Unlock()
   212  	if !r.mu.disconnected {
   213  		if r.mu.outputLoopCancelFn != nil {
   214  			r.mu.outputLoopCancelFn()
   215  		}
   216  		r.mu.disconnected = true
   217  		r.errC <- pErr
   218  	}
   219  }
   220  
   221  // outputLoop is the operational loop for a single registration. The behavior
   222  // is as thus:
   223  //
   224  // 1. If a catch-up scan is indicated, run one before beginning the proper
   225  // output loop.
   226  // 2. After catch-up is complete, begin reading from the registration buffer
   227  // channel and writing to the output stream until the buffer is empty *and*
   228  // the overflow flag has been set.
   229  //
   230  // The loop exits with any error encountered, if the provided context is
   231  // canceled, or when the buffer has overflowed and all pre-overflow entries
   232  // have been emitted.
   233  func (r *registration) outputLoop(ctx context.Context) error {
   234  	// If the registration has a catch-up scan,
   235  	if r.catchupIter != nil {
   236  		if err := r.runCatchupScan(); err != nil {
   237  			err = errors.Wrap(err, "catch-up scan failed")
   238  			log.Errorf(ctx, "%v", err)
   239  			return err
   240  		}
   241  	}
   242  
   243  	// Normal buffered output loop.
   244  	for {
   245  		overflowed := false
   246  		r.mu.Lock()
   247  		if len(r.buf) == 0 {
   248  			overflowed = r.mu.overflowed
   249  			r.mu.caughtUp = true
   250  		}
   251  		r.mu.Unlock()
   252  		if overflowed {
   253  			return newErrBufferCapacityExceeded().GoError()
   254  		}
   255  
   256  		select {
   257  		case nextEvent := <-r.buf:
   258  			if err := r.stream.Send(nextEvent); err != nil {
   259  				return err
   260  			}
   261  		case <-ctx.Done():
   262  			return ctx.Err()
   263  		case <-r.stream.Context().Done():
   264  			return r.stream.Context().Err()
   265  		}
   266  	}
   267  }
   268  
   269  func (r *registration) runOutputLoop(ctx context.Context) {
   270  	r.mu.Lock()
   271  	ctx, r.mu.outputLoopCancelFn = context.WithCancel(ctx)
   272  	r.mu.Unlock()
   273  	err := r.outputLoop(ctx)
   274  	r.disconnect(roachpb.NewError(err))
   275  }
   276  
// runCatchupScan starts a catchup scan which will output entries for all
// recorded changes in the replica that are newer than the catchupTimestamp.
// This uses the iterator provided when the registration was originally created;
// after the scan completes (successfully or not), the iterator is closed and
// the registration's reference to it is cleared.
//
// Events are written directly to the registration's stream, bypassing the
// buffer. For each key, versions are emitted oldest first. If the
// registration was created withDiff, each emitted version also carries the
// value of the version immediately preceding it.
//
// It returns nil when there is no iterator, and otherwise the first error
// reported by the iterator, by unmarshaling a metadata value, or by
// stream.Send.
func (r *registration) runCatchupScan() error {
	if r.catchupIter == nil {
		return nil
	}
	start := timeutil.Now()
	// Release the iterator and record the elapsed time on every exit path.
	defer func() {
		r.catchupIter.Close()
		r.catchupIter = nil
		r.metrics.RangeFeedCatchupScanNanos.Inc(timeutil.Since(start).Nanoseconds())
	}()

	// a is used below to copy the iterator's key and value bytes before the
	// iterator is advanced.
	// NOTE(review): presumably required because UnsafeKey/UnsafeValue are only
	// valid until the iterator moves - confirm against storage.SimpleIterator.
	var a bufalloc.ByteAllocator
	startKey := storage.MakeMVCCMetadataKey(r.span.Key)
	endKey := storage.MakeMVCCMetadataKey(r.span.EndKey)

	// Iterator will encounter historical values for each key in
	// reverse-chronological order. To output in chronological order, store
	// events for the same key until a different key is encountered, then output
	// the encountered values in reverse. This also allows us to buffer events
	// as we fill in previous values.
	var lastKey roachpb.Key
	reorderBuf := make([]roachpb.RangeFeedEvent, 0, 5)
	// addPrevToLastEvent attaches val as the previous value of the most
	// recently buffered event, if there is one. It panics if that event
	// already has a previous value.
	addPrevToLastEvent := func(val []byte) {
		if l := len(reorderBuf); l > 0 {
			if reorderBuf[l-1].Val.PrevValue.IsPresent() {
				panic("RangeFeedValue.PrevVal unexpectedly set")
			}
			reorderBuf[l-1].Val.PrevValue.RawBytes = val
		}
	}
	// outputEvents sends the buffered events in reverse order of insertion
	// and then empties the buffer, keeping its backing storage. On a Send
	// error it returns immediately without clearing the buffer.
	outputEvents := func() error {
		for i := len(reorderBuf) - 1; i >= 0; i-- {
			// Send a copy so the buffer slot can be reused afterwards.
			e := reorderBuf[i]
			if err := r.stream.Send(&e); err != nil {
				return err
			}
		}
		reorderBuf = reorderBuf[:0]
		return nil
	}

	// Iterate through all keys using Next. We want to publish all committed
	// versions of each key that are after the registration's startTS, so we
	// can't use NextKey.
	var meta enginepb.MVCCMetadata
	r.catchupIter.SeekGE(startKey)
	for {
		// Stop on iterator error, exhaustion, or once past the end of the
		// registration's span.
		if ok, err := r.catchupIter.Valid(); err != nil {
			return err
		} else if !ok || !r.catchupIter.UnsafeKey().Less(endKey) {
			break
		}

		unsafeKey := r.catchupIter.UnsafeKey()
		unsafeVal := r.catchupIter.UnsafeValue()
		if !unsafeKey.IsValue() {
			// Found a metadata key.
			if err := protoutil.Unmarshal(unsafeVal, &meta); err != nil {
				return errors.Wrapf(err, "unmarshaling mvcc meta: %v", unsafeKey)
			}
			if !meta.IsInline() {
				// This is an MVCCMetadata key for an intent. The catchup scan
				// only cares about committed values, so ignore this and skip
				// past the corresponding provisional key-value. To do this,
				// scan to the timestamp immediately before (i.e. the key
				// immediately after) the provisional key.
				r.catchupIter.SeekGE(storage.MVCCKey{
					Key:       unsafeKey.Key,
					Timestamp: hlc.Timestamp(meta.Timestamp).Prev(),
				})
				continue
			}

			// If write is inline, it doesn't have a timestamp so we don't
			// filter on the registration's starting timestamp. Instead, we
			// return all inline writes.
			unsafeVal = meta.RawBytes
		}

		// Determine whether the iterator moved to a new key.
		sameKey := bytes.Equal(unsafeKey.Key, lastKey)
		if !sameKey {
			// If so, output events for the last key encountered.
			if err := outputEvents(); err != nil {
				return err
			}
			// Remember a copy of the new key; it is shared by all events
			// emitted for this key.
			a, lastKey = a.Copy(unsafeKey.Key, 0)
		}
		key := lastKey
		ts := unsafeKey.Timestamp

		// Ignore the version if it's not inline and its timestamp is at
		// or before the registration's (exclusive) starting timestamp.
		ignore := !(ts.IsEmpty() || r.catchupTimestamp.Less(ts))
		if ignore && !r.withDiff {
			// Skip all the way to the next key.
			// NB: fast-path to avoid value copy when !r.withDiff.
			r.catchupIter.NextKey()
			continue
		}

		var val []byte
		a, val = a.Copy(unsafeVal, 0)
		if r.withDiff {
			// Update the last version with its previous value (this version).
			// This happens even for an ignored version, since it may be the
			// previous value of the oldest emitted version of this key.
			addPrevToLastEvent(val)
		}

		if ignore {
			// Skip all the way to the next key.
			r.catchupIter.NextKey()
		} else {
			// Move to the next version of this key.
			r.catchupIter.Next()

			var event roachpb.RangeFeedEvent
			event.MustSetValue(&roachpb.RangeFeedValue{
				Key: key,
				Value: roachpb.Value{
					RawBytes:  val,
					Timestamp: ts,
				},
			})
			reorderBuf = append(reorderBuf, event)
		}
	}

	// Output events for the last key encountered.
	return outputEvents()
}
   411  
   412  // ID implements interval.Interface.
   413  func (r *registration) ID() uintptr {
   414  	return uintptr(r.id)
   415  }
   416  
   417  // Range implements interval.Interface.
   418  func (r *registration) Range() interval.Range {
   419  	return r.keys
   420  }
   421  
   422  func (r registration) String() string {
   423  	return fmt.Sprintf("[%s @ %s+]", r.span, r.catchupTimestamp)
   424  }
   425  
// registry holds a set of registrations and manages their lifecycle.
type registry struct {
	// tree indexes the registrations by key range so that the ones
	// overlapping a given span can be located.
	tree    interval.Tree // *registration items
	// idAlloc is the most recently handed out registration id; it is
	// incremented for every new registration.
	idAlloc int64
}
   431  
   432  func makeRegistry() registry {
   433  	return registry{
   434  		tree: interval.NewTree(interval.ExclusiveOverlapper),
   435  	}
   436  }
   437  
   438  // Len returns the number of registrations in the registry.
   439  func (reg *registry) Len() int {
   440  	return reg.tree.Len()
   441  }
   442  
   443  // NewFilter returns a operation filter reflecting the registrations
   444  // in the registry.
   445  func (reg *registry) NewFilter() *Filter {
   446  	return newFilterFromRegistry(reg)
   447  }
   448  
   449  // Register adds the provided registration to the registry.
   450  func (reg *registry) Register(r *registration) {
   451  	r.id = reg.nextID()
   452  	r.keys = r.span.AsRange()
   453  	if err := reg.tree.Insert(r, false /* fast */); err != nil {
   454  		panic(err)
   455  	}
   456  }
   457  
   458  func (reg *registry) nextID() int64 {
   459  	reg.idAlloc++
   460  	return reg.idAlloc
   461  }
   462  
   463  // PublishToOverlapping publishes the provided event to all registrations whose
   464  // range overlaps the specified span.
   465  func (reg *registry) PublishToOverlapping(span roachpb.Span, event *roachpb.RangeFeedEvent) {
   466  	// Determine the earliest starting timestamp that a registration
   467  	// can have while still needing to hear about this event.
   468  	var minTS hlc.Timestamp
   469  	switch t := event.GetValue().(type) {
   470  	case *roachpb.RangeFeedValue:
   471  		// Only publish values to registrations with starting
   472  		// timestamps equal to or greater than the value's timestamp.
   473  		minTS = t.Value.Timestamp
   474  	case *roachpb.RangeFeedCheckpoint:
   475  		// Always publish checkpoint notifications, regardless of a registration's
   476  		// starting timestamp.
   477  		//
   478  		// TODO(dan): It's unclear if this is the right contract, it's certainly
   479  		// surprising. Revisit this once RangeFeed has more users.
   480  		minTS = hlc.MaxTimestamp
   481  	default:
   482  		panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t))
   483  	}
   484  
   485  	reg.forOverlappingRegs(span, func(r *registration) (bool, *roachpb.Error) {
   486  		// Don't publish events if they are equal to or less
   487  		// than the registration's starting timestamp.
   488  		if r.catchupTimestamp.Less(minTS) {
   489  			r.publish(event)
   490  		}
   491  		return false, nil
   492  	})
   493  }
   494  
   495  // Unregister removes a registration from the registry. It is assumed that the
   496  // registration has already been disconnected, this is intended only to clean
   497  // up the registry.
   498  func (reg *registry) Unregister(r *registration) {
   499  	if err := reg.tree.Delete(r, false /* fast */); err != nil {
   500  		panic(err)
   501  	}
   502  }
   503  
   504  // Disconnect disconnects all registrations that overlap the specified span with
   505  // a nil error.
   506  func (reg *registry) Disconnect(span roachpb.Span) {
   507  	reg.DisconnectWithErr(span, nil /* pErr */)
   508  }
   509  
   510  // DisconnectWithErr disconnects all registrations that overlap the specified
   511  // span with the provided error.
   512  func (reg *registry) DisconnectWithErr(span roachpb.Span, pErr *roachpb.Error) {
   513  	reg.forOverlappingRegs(span, func(_ *registration) (bool, *roachpb.Error) {
   514  		return true, pErr
   515  	})
   516  }
   517  
   518  // all is a span that overlaps with all registrations.
   519  var all = roachpb.Span{Key: roachpb.KeyMin, EndKey: roachpb.KeyMax}
   520  
   521  // forOverlappingRegs calls the provided function on each registration that
   522  // overlaps the span. If the function returns true for a given registration
   523  // then that registration is unregistered and the error returned by the
   524  // function is send on its corresponding error channel.
   525  func (reg *registry) forOverlappingRegs(
   526  	span roachpb.Span, fn func(*registration) (disconnect bool, pErr *roachpb.Error),
   527  ) {
   528  	var toDelete []interval.Interface
   529  	matchFn := func(i interval.Interface) (done bool) {
   530  		r := i.(*registration)
   531  		dis, pErr := fn(r)
   532  		if dis {
   533  			r.disconnect(pErr)
   534  			toDelete = append(toDelete, i)
   535  		}
   536  		return false
   537  	}
   538  	if span.EqualValue(all) {
   539  		reg.tree.Do(matchFn)
   540  	} else {
   541  		reg.tree.DoMatching(matchFn, span.AsRange())
   542  	}
   543  
   544  	if len(toDelete) == reg.tree.Len() {
   545  		reg.tree.Clear()
   546  	} else if len(toDelete) == 1 {
   547  		if err := reg.tree.Delete(toDelete[0], false /* fast */); err != nil {
   548  			panic(err)
   549  		}
   550  	} else if len(toDelete) > 1 {
   551  		for _, i := range toDelete {
   552  			if err := reg.tree.Delete(i, true /* fast */); err != nil {
   553  				panic(err)
   554  			}
   555  		}
   556  		reg.tree.AdjustRanges()
   557  	}
   558  }
   559  
   560  // Wait for this registration to completely process its internal buffer.
   561  func (r *registration) waitForCaughtUp() error {
   562  	opts := retry.Options{
   563  		InitialBackoff: 5 * time.Millisecond,
   564  		Multiplier:     2,
   565  		MaxBackoff:     10 * time.Second,
   566  		MaxRetries:     50,
   567  	}
   568  	for re := retry.Start(opts); re.Next(); {
   569  		r.mu.Lock()
   570  		caughtUp := len(r.buf) == 0 && r.mu.caughtUp
   571  		r.mu.Unlock()
   572  		if caughtUp {
   573  			return nil
   574  		}
   575  	}
   576  	return errors.Errorf("registration %v failed to empty in time", r.Range())
   577  }
   578  
   579  // waitForCaughtUp waits for all registrations overlapping the given span to
   580  // completely process their internal buffers.
   581  func (reg *registry) waitForCaughtUp(span roachpb.Span) error {
   582  	var outerErr error
   583  	reg.forOverlappingRegs(span, func(r *registration) (bool, *roachpb.Error) {
   584  		if outerErr == nil {
   585  			outerErr = r.waitForCaughtUp()
   586  		}
   587  		return false, nil
   588  	})
   589  	return outerErr
   590  }