github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed.go

// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/span"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

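// jsonMetaSentinel is, as best as can be inferred here, the reserved key
// under which the JSON encoder nests CockroachDB metadata (such as resolved
// timestamps) in emitted messages.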
const (
	jsonMetaSentinel = `__crdb__`
)

type emitEntry struct {
	// row, if not the zero value, represents a changed row to be emitted.
	row encodeRow

	// resolved, if non-nil, is a guarantee for the associated
	// span that no previously unseen entries with a lower or equal updated
	// timestamp will be emitted.
	resolved *jobspb.ResolvedSpan

	// bufferGetTimestamp is the time this entry came out of the buffer.
	bufferGetTimestamp time.Time
}

// kvsToRows gets changed KVs from a closure and converts them into SQL rows.
// It returns a closure that may be repeatedly called to advance the
// changefeed. The returned closure is not safe for concurrent use.
func kvsToRows(
	codec keys.SQLCodec,
	leaseMgr *lease.Manager,
	details jobspb.ChangefeedDetails,
	inputFn func(context.Context) (kvfeed.Event, error),
) func(context.Context) ([]emitEntry, error) {
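	// The diff option requires fetching and emitting the previous value of
	// each changed row alongside the new one.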
	_, withDiff := details.Opts[changefeedbase.OptDiff]
	rfCache := newRowFetcherCache(codec, leaseMgr)

	var kvs row.SpanKVFetcher
	appendEmitEntryForKV := func(
		ctx context.Context,
		output []emitEntry,
		kv roachpb.KeyValue,
		prevVal roachpb.Value,
		schemaTimestamp hlc.Timestamp,
		prevSchemaTimestamp hlc.Timestamp,
		bufferGetTimestamp time.Time,
	) ([]emitEntry, error) {

		desc, err := rfCache.TableDescForKey(ctx, kv.Key, schemaTimestamp)
		if err != nil {
			return nil, err
		}
		if _, ok := details.Targets[desc.ID]; !ok {
			// This kv is for an interleaved table that we're not watching.
			if log.V(3) {
				log.Infof(ctx, `skipping key from unwatched table %s: %s`, desc.Name, kv.Key)
			}
			return nil, nil
		}

		rf, err := rfCache.RowFetcherForTableDesc(desc)
		if err != nil {
			return nil, err
		}

		// Get new value.
		var r emitEntry
		r.bufferGetTimestamp = bufferGetTimestamp
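		// Feed the single KV through a row.Fetcher to decode it into datums
		// under the schema in effect at schemaTimestamp.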
		{
			// TODO(dan): Handle tables with multiple column families.
			// Reuse kvs to save allocations.
			kvs.KVs = kvs.KVs[:0]
			kvs.KVs = append(kvs.KVs, kv)
			if err := rf.StartScanFrom(ctx, &kvs); err != nil {
				return nil, err
			}

			r.row.datums, r.row.tableDesc, _, err = rf.NextRow(ctx)
			if err != nil {
				return nil, err
			}
			if r.row.datums == nil {
				return nil, errors.AssertionFailedf("unexpected empty datums")
			}
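			// Copy the datums: the row.Fetcher reuses its scratch row on the
			// next call to NextRow.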
			r.row.datums = append(sqlbase.EncDatumRow(nil), r.row.datums...)
			r.row.deleted = rf.RowIsDeleted()
			r.row.updated = schemaTimestamp

			// Assert that we don't get a second row from the row.Fetcher. We
			// fed it a single KV, so that would be surprising.
			var nextRow emitEntry
			nextRow.row.datums, nextRow.row.tableDesc, _, err = rf.NextRow(ctx)
			if err != nil {
				return nil, err
			}
			if nextRow.row.datums != nil {
				return nil, errors.AssertionFailedf("unexpected non-empty datums")
			}
		}

		// Get prev value, if necessary.
		if withDiff {
			prevRF := rf
			if prevSchemaTimestamp != schemaTimestamp {
				// If the previous value is being interpreted under a different
				// version of the schema, fetch the correct table descriptor and
				// create a new row.Fetcher with it.
				prevDesc, err := rfCache.TableDescForKey(ctx, kv.Key, prevSchemaTimestamp)
				if err != nil {
					return nil, err
				}

				prevRF, err = rfCache.RowFetcherForTableDesc(prevDesc)
				if err != nil {
					return nil, err
				}
			}

			prevKV := roachpb.KeyValue{Key: kv.Key, Value: prevVal}
			// TODO(dan): Handle tables with multiple column families.
			// Reuse kvs to save allocations.
			kvs.KVs = kvs.KVs[:0]
			kvs.KVs = append(kvs.KVs, prevKV)
			if err := prevRF.StartScanFrom(ctx, &kvs); err != nil {
				return nil, err
			}
			r.row.prevDatums, r.row.prevTableDesc, _, err = prevRF.NextRow(ctx)
			if err != nil {
				return nil, err
			}
			if r.row.prevDatums == nil {
				return nil, errors.AssertionFailedf("unexpected empty datums")
			}
			r.row.prevDatums = append(sqlbase.EncDatumRow(nil), r.row.prevDatums...)
			r.row.prevDeleted = prevRF.RowIsDeleted()

			// Assert that we don't get a second row from the row.Fetcher. We
			// fed it a single KV, so that would be surprising.
			var nextRow emitEntry
			nextRow.row.prevDatums, nextRow.row.prevTableDesc, _, err = prevRF.NextRow(ctx)
			if err != nil {
				return nil, err
			}
			if nextRow.row.prevDatums != nil {
				return nil, errors.AssertionFailedf("unexpected non-empty datums")
			}
		}

		output = append(output, r)
		return output, nil
	}

	var output []emitEntry
	return func(ctx context.Context) ([]emitEntry, error) {
		// Reuse output to save allocations.
		output = output[:0]
		for {
			input, err := inputFn(ctx)
			if err != nil {
				return nil, err
			}
			switch input.Type() {
			case kvfeed.KVEvent:
				kv := input.KV()
				if log.V(3) {
					log.Infof(ctx, "changed key %s %s", kv.Key, kv.Value.Timestamp)
				}
				schemaTimestamp := kv.Value.Timestamp
				prevSchemaTimestamp := schemaTimestamp
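				// During a schema-change backfill, interpret the row under the
				// backfill's schema version and its previous value under the
				// version immediately preceding it.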
				if backfillTs := input.BackfillTimestamp(); backfillTs != (hlc.Timestamp{}) {
					schemaTimestamp = backfillTs
					prevSchemaTimestamp = schemaTimestamp.Prev()
				}
				output, err = appendEmitEntryForKV(
					ctx, output, kv, input.PrevValue(),
					schemaTimestamp, prevSchemaTimestamp,
					input.BufferGetTimestamp())
				if err != nil {
					return nil, err
				}
			case kvfeed.ResolvedEvent:
				output = append(output, emitEntry{
					resolved:           input.Resolved(),
					bufferGetTimestamp: input.BufferGetTimestamp(),
				})
			}
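			// Keep consuming events until at least one entry has been
			// buffered; KVs from unwatched tables produce no output.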
			if output != nil {
				return output, nil
			}
		}
	}
}

// emitEntries connects to a sink, receives rows from a closure, and repeatedly
// emits them to the sink. It returns a closure that may be repeatedly called to
// advance the changefeed and which returns span-level resolved timestamp
// updates. The returned closure is not safe for concurrent use. Note that rows
// read from `inputFn` that precede or equal the Frontier of `sf` will not be
// emitted because they're provably duplicates.
func emitEntries(
	settings *cluster.Settings,
	details jobspb.ChangefeedDetails,
	cursor hlc.Timestamp,
	sf *span.Frontier,
	encoder Encoder,
	sink Sink,
	inputFn func(context.Context) ([]emitEntry, error),
	knobs TestingKnobs,
	metrics *Metrics,
) func(context.Context) ([]jobspb.ResolvedSpan, error) {
	var scratch bufalloc.ByteAllocator
	emitRowFn := func(ctx context.Context, row encodeRow) error {
		// Ensure that row updates are strictly newer than the least resolved
		// timestamp being tracked by the local span frontier. The poller
		// should not be forwarding row updates that have timestamps less than
		// or equal to any resolved timestamp it has previously forwarded.
		// TODO(dan): This should be an assertion once we're confident this can
		// never happen under any circumstance.
		if row.updated.LessEq(sf.Frontier()) && !row.updated.Equal(cursor) {
			log.Errorf(ctx, "cdc ux violation: detected timestamp %s that is less than "+
				"or equal to the local frontier %s.", cloudStorageFormatTime(row.updated),
				cloudStorageFormatTime(sf.Frontier()))
			return nil
		}
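		// Copy the encoded key and value out of the encoder's scratch space
		// so they remain valid after subsequent encode calls.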
		var keyCopy, valueCopy []byte
		encodedKey, err := encoder.EncodeKey(ctx, row)
		if err != nil {
			return err
		}
		scratch, keyCopy = scratch.Copy(encodedKey, 0 /* extraCap */)
		encodedValue, err := encoder.EncodeValue(ctx, row)
		if err != nil {
			return err
		}
		scratch, valueCopy = scratch.Copy(encodedValue, 0 /* extraCap */)

		if knobs.BeforeEmitRow != nil {
			if err := knobs.BeforeEmitRow(ctx); err != nil {
				return err
			}
		}
		if err := sink.EmitRow(
			ctx, row.tableDesc, keyCopy, valueCopy, row.updated,
		); err != nil {
			return err
		}
		if log.V(3) {
			log.Infof(ctx, `row %s: %s -> %s`, row.tableDesc.Name, keyCopy, valueCopy)
		}
		return nil
	}

	var lastFlush time.Time
	// TODO(dan): We could keep these in `sf` to eliminate dups.
	var resolvedSpans []jobspb.ResolvedSpan

	return func(ctx context.Context) ([]jobspb.ResolvedSpan, error) {
		inputs, err := inputFn(ctx)
		if err != nil {
			return nil, err
		}
		var boundaryReached bool
		for _, input := range inputs {
			if input.bufferGetTimestamp == (time.Time{}) {
				// We could gracefully handle this instead of panicking, but
				// we'd really like to be able to reason about this data, so
				// instead we're defensive. If this is ever seen in prod without
				// breaking a unit test, then we have a pretty severe test
				// coverage issue.
				panic(`unreachable: bufferGetTimestamp is set by all codepaths`)
			}
			processingNanos := timeutil.Since(input.bufferGetTimestamp).Nanoseconds()
			metrics.ProcessingNanos.Inc(processingNanos)

			if input.row.datums != nil {
				if err := emitRowFn(ctx, input.row); err != nil {
					return nil, err
				}
			}
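			// Resolved events advance the local span frontier and are
			// buffered until the next sink flush.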
			if input.resolved != nil {
				boundaryReached = boundaryReached || input.resolved.BoundaryReached
				_ = sf.Forward(input.resolved.Span, input.resolved.Timestamp)
				resolvedSpans = append(resolvedSpans, *input.resolved)
			}
		}

		// If the resolved timestamp frequency is specified, use it as a rough
		// approximation of how latency-sensitive the changefeed user is. If
		// it's not, fall back to the poll interval.
		//
		// The current poller implementation means we emit a changefeed-level
		// resolved timestamp to the user once per changefeedPollInterval. This
		// buffering adds on average timeBetweenFlushes/2 to that latency. With
		// timeBetweenFlushes and changefeedPollInterval both set to 1s, TPCC
		// was seeing about 100x more time spent emitting than flushing.
		// Dividing by 5 tries to balance these a bit, but ultimately is fairly
		// unprincipled.
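		// With a 1s poll interval, for instance, this flushes at most once
		// every 200ms.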
		//
		// NB: As long as we periodically get new span-level resolved timestamps
		// from the poller (which should always happen, even if the watched data
		// is not changing), then this is sufficient and we don't have to do
		// anything fancy with timers.
		var timeBetweenFlushes time.Duration
		if r, ok := details.Opts[changefeedbase.OptResolvedTimestamps]; ok && r != `` {
			var err error
			if timeBetweenFlushes, err = time.ParseDuration(r); err != nil {
				return nil, err
			}
		} else {
			timeBetweenFlushes = changefeedbase.TableDescriptorPollInterval.Get(&settings.SV) / 5
		}
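		// Only flush when there is something to emit and either enough time
		// has passed since the last flush or a boundary event forces an
		// immediate flush.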
		if len(resolvedSpans) == 0 ||
			(timeutil.Since(lastFlush) < timeBetweenFlushes && !boundaryReached) {
			return nil, nil
		}

		// Make sure to flush the sink before forwarding resolved spans,
		// otherwise, we could lose buffered messages and violate the
		// at-least-once guarantee. This is also true for checkpointing the
		// resolved spans in the job progress.
		if err := sink.Flush(ctx); err != nil {
			return nil, err
		}
		lastFlush = timeutil.Now()
		if knobs.AfterSinkFlush != nil {
			if err := knobs.AfterSinkFlush(); err != nil {
				return nil, err
			}
		}
		ret := append([]jobspb.ResolvedSpan(nil), resolvedSpans...)
		resolvedSpans = resolvedSpans[:0]
		return ret, nil
	}
}

// emitResolvedTimestamp emits a changefeed-level resolved timestamp to the
// sink.
func emitResolvedTimestamp(
	ctx context.Context, encoder Encoder, sink Sink, resolved hlc.Timestamp,
) error {
	// TODO(dan): Emit more fine-grained (table level) resolved
	// timestamps.
	if err := sink.EmitResolvedTimestamp(ctx, encoder, resolved); err != nil {
		return err
	}
	if log.V(2) {
		log.Infof(ctx, `resolved %s`, resolved)
	}
	return nil
}

// createProtectedTimestampRecord will create a record to protect the spans for
// this changefeed at the resolved timestamp. The progress struct will be
// updated to refer to this new protected timestamp record.
func createProtectedTimestampRecord(
	ctx context.Context,
	pts protectedts.Storage,
	txn *kv.Txn,
	jobID int64,
	targets jobspb.ChangefeedTargets,
	resolved hlc.Timestamp,
	progress *jobspb.ChangefeedProgress,
) error {
	progress.ProtectedTimestampRecord = uuid.MakeV4()
	log.VEventf(ctx, 2, "creating protected timestamp %v at %v",
		progress.ProtectedTimestampRecord, resolved)
	spansToProtect := makeSpansToProtect(targets)
	rec := jobsprotectedts.MakeRecord(
		progress.ProtectedTimestampRecord, jobID, resolved, spansToProtect)
	return pts.Protect(ctx, txn, rec)
}

func makeSpansToProtect(targets jobspb.ChangefeedTargets) []roachpb.Span {
	// NB: We add 1 because we're also going to protect system.descriptors.
	// We protect system.descriptors because a changefeed needs the full
	// history of table descriptors in order to interpret data under each
	// schema version.
	spansToProtect := make([]roachpb.Span, 0, len(targets)+1)
	addTablePrefix := func(id uint32) {
		tablePrefix := keys.TODOSQLCodec.TablePrefix(id)
		spansToProtect = append(spansToProtect, roachpb.Span{
			Key:    tablePrefix,
			EndKey: tablePrefix.PrefixEnd(),
		})
	}
	for t := range targets {
		addTablePrefix(uint32(t))
	}
	addTablePrefix(keys.DescriptorTableID)
	return spansToProtect
}

// initialScanFromOptions returns whether the options indicate the need for an
// initial scan on the first run.
func initialScanFromOptions(opts map[string]string) bool {
	_, cursor := opts[changefeedbase.OptCursor]
	_, initialScan := opts[changefeedbase.OptInitialScan]
	_, noInitialScan := opts[changefeedbase.OptNoInitialScan]
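	// An initial scan runs when one is explicitly requested alongside a
	// cursor, or by default when neither a cursor nor no_initial_scan is
	// specified.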
	return (cursor && initialScan) || (!cursor && !noInitialScan)
}