github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed_processors.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package changefeedccl
    10  
    11  import (
    12  	"context"
    13  	"fmt"
    14  	"math"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
    18  	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
    19  	"github.com/cockroachdb/cockroach/pkg/jobs"
    20  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    21  	"github.com/cockroachdb/cockroach/pkg/kv"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
    24  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    28  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    33  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    34  	"github.com/cockroachdb/cockroach/pkg/util/log"
    35  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    36  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/span"
    38  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    40  	"github.com/cockroachdb/errors"
    41  )
    42  
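        // changeAggregator is the per-node processor of a changefeed flow. Roughly,
        // it consumes raw kv changes from a kvfeed, encodes them, emits them to the
        // configured sink, and forwards span-level resolved timestamps downstream to
        // the changeFrontier; see the field comments below for details.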
    43  type changeAggregator struct {
    44  	execinfra.ProcessorBase
    45  
    46  	flowCtx *execinfra.FlowCtx
    47  	spec    execinfrapb.ChangeAggregatorSpec
    48  	memAcc  mon.BoundAccount
    49  
    50  	// cancel shuts down the processor, both the `Next()` flow and the kvfeed.
    51  	cancel func()
    52  	// errCh contains the return values of the kvfeed.
    53  	errCh chan error
    54  	// kvFeedDoneCh is closed when the kvfeed exits.
    55  	kvFeedDoneCh chan struct{}
    56  	kvFeedMemMon *mon.BytesMonitor
    57  
    58  	// encoder is the Encoder to use for key and value serialization.
    59  	encoder Encoder
    60  	// sink is the Sink to write rows to. Resolved timestamps are never written
    61  	// by changeAggregator.
    62  	sink Sink
    63  	// tickFn is the workhorse behind Next(). It pulls kv changes from the
    64  	// buffer that the poller fills, handles table leasing, converts them to
    65  	// rows, and writes them to the sink.
    66  	tickFn func(context.Context) ([]jobspb.ResolvedSpan, error)
    67  	// changedRowBuf, if non-nil, contains changed rows to be emitted. Anything
    68  	// queued in `resolvedSpanBuf` is dependent on these having been emitted, so
    69  	// this one must be empty before moving on to that one.
    70  	changedRowBuf *encDatumRowBuffer
    71  	// resolvedSpanBuf contains resolved span updates to send to changeFrontier.
    72  	// If sink is a bufferSink, it must be emptied before these are sent.
    73  	resolvedSpanBuf encDatumRowBuffer
    74  }
    75  
    76  type timestampLowerBoundOracle interface {
    77  	inclusiveLowerBoundTS() hlc.Timestamp
    78  }
    79  
    80  type changeAggregatorLowerBoundOracle struct {
    81  	sf                         *span.Frontier
    82  	initialInclusiveLowerBound hlc.Timestamp
    83  }
    84  
    85  // inclusiveLowerBoundTS is used to generate a representative timestamp to name
    86  // cloudStorageSink files. This timestamp is either the statement time (in case this
    87  // changefeed job hasn't yet seen any resolved timestamps) or the successor timestamp to
    88  // the local span frontier. This convention is chosen to preserve CDC's ordering
    89  // guarantees. See comment on cloudStorageSink for more details.
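        // For example, if the local span frontier sits at timestamp t, any file
        // created after this point only contains updates with timestamps strictly
        // greater than t, so naming it with t.Next() gives an inclusive lower bound
        // on the timestamps it contains.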
    90  func (o *changeAggregatorLowerBoundOracle) inclusiveLowerBoundTS() hlc.Timestamp {
    91  	if frontier := o.sf.Frontier(); !frontier.IsEmpty() {
    92  		// We call `Next()` here on the frontier because this allows us
    93  		// to name files using a timestamp that is an inclusive lower bound
    94  		// on the timestamps of the updates contained within the file.
    95  		// Files being created at the point this method is called are guaranteed
    96  		// to contain row updates with timestamps strictly greater than the local
    97  		// span frontier timestamp.
    98  		return frontier.Next()
    99  	}
   100  	// This should only be returned in the case where the changefeed job hasn't yet
   101  	// seen a resolved timestamp.
   102  	return o.initialInclusiveLowerBound
   103  }
   104  
   105  var _ execinfra.Processor = &changeAggregator{}
   106  var _ execinfra.RowSource = &changeAggregator{}
   107  
   108  func newChangeAggregatorProcessor(
   109  	flowCtx *execinfra.FlowCtx,
   110  	processorID int32,
   111  	spec execinfrapb.ChangeAggregatorSpec,
   112  	output execinfra.RowReceiver,
   113  ) (execinfra.Processor, error) {
   114  	ctx := flowCtx.EvalCtx.Ctx()
   115  	memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "changeagg-mem")
   116  	ca := &changeAggregator{
   117  		flowCtx: flowCtx,
   118  		spec:    spec,
   119  		memAcc:  memMonitor.MakeBoundAccount(),
   120  	}
   121  	if err := ca.Init(
   122  		ca,
   123  		&execinfrapb.PostProcessSpec{},
   124  		nil, /* types */
   125  		flowCtx,
   126  		processorID,
   127  		output,
   128  		memMonitor,
   129  		execinfra.ProcStateOpts{
   130  			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
   131  				ca.close()
   132  				return nil
   133  			},
   134  		},
   135  	); err != nil {
   136  		return nil, err
   137  	}
   138  
   139  	var err error
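        	// The encoder is derived from the feed's options (notably the `format`
        	// option); as a rough sketch, `format=json` selects the JSON encoder while
        	// `format=experimental_avro` is expected to require a schema registry. See
        	// getEncoder for the authoritative mapping.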
   140  	if ca.encoder, err = getEncoder(ca.spec.Feed.Opts); err != nil {
   141  		return nil, err
   142  	}
   143  
   144  	return ca, nil
   145  }
   146  
   147  func (ca *changeAggregator) OutputTypes() []*types.T {
   148  	return changefeedResultTypes
   149  }
   150  
   151  // Start is part of the RowSource interface.
   152  func (ca *changeAggregator) Start(ctx context.Context) context.Context {
   153  	ctx, ca.cancel = context.WithCancel(ctx)
   154  	// StartInternal is called at the beginning of the function because there are
   155  	// early returns if errors are detected.
   156  	ctx = ca.StartInternal(ctx, changeAggregatorProcName)
   157  
   158  	spans, sf := ca.setupSpans()
   159  	timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf, initialInclusiveLowerBound: ca.spec.Feed.StatementTime}
   160  	nodeID, err := ca.flowCtx.EvalCtx.NodeID.OptionalNodeIDErr(48274)
   161  	if err != nil {
   162  		ca.MoveToDraining(err)
   163  		return ctx
   164  	}
   165  
   166  	if ca.sink, err = getSink(
   167  		ctx, ca.spec.Feed.SinkURI, nodeID, ca.spec.Feed.Opts, ca.spec.Feed.Targets,
   168  		ca.flowCtx.Cfg.Settings, timestampOracle, ca.flowCtx.Cfg.ExternalStorageFromURI,
   169  	); err != nil {
   170  		err = MarkRetryableError(err)
   171  		// Early abort in the case that there is an error creating the sink.
   172  		ca.MoveToDraining(err)
   173  		ca.cancel()
   174  		return ctx
   175  	}
   176  
   177  	// This is the correct point to set up certain hooks depending on the sink
   178  	// type.
   179  	if b, ok := ca.sink.(*bufferSink); ok {
   180  		ca.changedRowBuf = &b.buf
   181  	}
   182  
   183  	// The job registry has a set of metrics used to monitor the various jobs it
   184  	// runs. They're all stored as the `metric.Struct` interface because of
   185  	// dependency cycles.
   186  	metrics := ca.flowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
   187  	ca.sink = makeMetricsSink(metrics, ca.sink)
   188  	ca.sink = &errorWrapperSink{wrapped: ca.sink}
   189  
   190  	var knobs TestingKnobs
   191  	if cfKnobs, ok := ca.flowCtx.TestingKnobs().Changefeed.(*TestingKnobs); ok {
   192  		knobs = *cfKnobs
   193  	}
   194  
   195  	// It seems like we should also be able to use `ca.ProcessorBase.MemMonitor`
   196  	// for the poller, but there is a race between the flow's MemoryMonitor
   197  	// getting Stopped and `changeAggregator.Close`, which causes panics. Not sure
   198  	// what to do about this yet.
   199  	kvFeedMemMonCapacity := kvfeed.MemBufferDefaultCapacity
   200  	if knobs.MemBufferCapacity != 0 {
   201  		kvFeedMemMonCapacity = knobs.MemBufferCapacity
   202  	}
   203  	kvFeedMemMon := mon.MakeMonitorInheritWithLimit("kvFeed", math.MaxInt64, ca.ProcessorBase.MemMonitor)
   204  	kvFeedMemMon.Start(ctx, nil /* pool */, mon.MakeStandaloneBudget(kvFeedMemMonCapacity))
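        	// The kv feed gets its own monitor started with a standalone budget, so the
        	// feed's buffer is capped at kvFeedMemMonCapacity rather than drawing from
        	// the processor's memory account.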
   205  	ca.kvFeedMemMon = &kvFeedMemMon
   206  
   207  	buf := kvfeed.MakeChanBuffer()
   208  	leaseMgr := ca.flowCtx.Cfg.LeaseManager.(*lease.Manager)
   209  	_, withDiff := ca.spec.Feed.Opts[changefeedbase.OptDiff]
   210  	kvfeedCfg := makeKVFeedCfg(ca.flowCtx.Cfg, leaseMgr, ca.kvFeedMemMon, ca.spec,
   211  		spans, withDiff, buf, metrics)
   212  	rowsFn := kvsToRows(ca.flowCtx.Codec(), leaseMgr, ca.spec.Feed, buf.Get)
   213  	ca.tickFn = emitEntries(ca.flowCtx.Cfg.Settings, ca.spec.Feed,
   214  		kvfeedCfg.InitialHighWater, sf, ca.encoder, ca.sink, rowsFn, knobs, metrics)
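        	// At this point the per-node pipeline is wired together: kvfeed -> buf ->
        	// kvsToRows -> emitEntries (tickFn) -> sink, with resolved spans surfaced
        	// to Next() via resolvedSpanBuf. startKVFeed below starts the producer side
        	// asynchronously.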
   215  	ca.startKVFeed(ctx, kvfeedCfg)
   216  
   217  	return ctx
   218  }
   219  
   220  func (ca *changeAggregator) startKVFeed(ctx context.Context, kvfeedCfg kvfeed.Config) {
   221  	// Give errCh enough buffer for both possible errors from supporting
   222  	// goroutines, though only the first one is ever used.
   223  	ca.errCh = make(chan error, 2)
   224  	ca.kvFeedDoneCh = make(chan struct{})
   225  	if err := ca.flowCtx.Stopper().RunAsyncTask(ctx, "changefeed-poller", func(ctx context.Context) {
   226  		defer close(ca.kvFeedDoneCh)
   227  		// Trying to call MoveToDraining here is racy (`MoveToDraining called in
   228  		// state stateTrailingMeta`), so return the error via a channel.
   229  		ca.errCh <- kvfeed.Run(ctx, kvfeedCfg)
   230  		ca.cancel()
   231  	}); err != nil {
   232  		// If err != nil then the RunAsyncTask closure never ran, which means we
   233  		// need to manually close ca.kvFeedDoneCh so `(*changeAggregator).close`
   234  		// doesn't hang.
   235  		close(ca.kvFeedDoneCh)
   236  		ca.errCh <- err
   237  		ca.cancel()
   238  	}
   239  }
   240  
   241  func makeKVFeedCfg(
   242  	cfg *execinfra.ServerConfig,
   243  	leaseMgr *lease.Manager,
   244  	mm *mon.BytesMonitor,
   245  	spec execinfrapb.ChangeAggregatorSpec,
   246  	spans []roachpb.Span,
   247  	withDiff bool,
   248  	buf kvfeed.EventBuffer,
   249  	metrics *Metrics,
   250  ) kvfeed.Config {
   251  	schemaChangeEvents := changefeedbase.SchemaChangeEventClass(
   252  		spec.Feed.Opts[changefeedbase.OptSchemaChangeEvents])
   253  	schemaChangePolicy := changefeedbase.SchemaChangePolicy(
   254  		spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
   255  	initialHighWater, needsInitialScan := getKVFeedInitialParameters(spec)
   256  	kvfeedCfg := kvfeed.Config{
   257  		Sink:               buf,
   258  		Settings:           cfg.Settings,
   259  		DB:                 cfg.DB,
   260  		Clock:              cfg.DB.Clock(),
   261  		Gossip:             cfg.Gossip,
   262  		Spans:              spans,
   263  		Targets:            spec.Feed.Targets,
   264  		LeaseMgr:           leaseMgr,
   265  		Metrics:            &metrics.KVFeedMetrics,
   266  		MM:                 mm,
   267  		InitialHighWater:   initialHighWater,
   268  		WithDiff:           withDiff,
   269  		NeedsInitialScan:   needsInitialScan,
   270  		SchemaChangeEvents: schemaChangeEvents,
   271  		SchemaChangePolicy: schemaChangePolicy,
   272  	}
   273  	return kvfeedCfg
   274  }
   275  
   276  // getKVFeedInitialParameters determines the starting timestamp for the kv feed
   277  // and whether or not an initial scan is needed. The need for an initial scan is
   278  // determined by whether the watches in the spec have a resolved timestamp. The
   279  // higher layers mark each watch with the checkpointed resolved timestamp if no
   280  // initial scan is needed.
   281  //
   282  // TODO(ajwerner): Utilize this partial checkpointing, especially in the face
   283  // of logical backfills of a single table while progress is made on others, or
   284  // get rid of it. See https://github.com/cockroachdb/cockroach/issues/43896.
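        // For example, given watches resolved at t3, t1, and t2, the initial
        // high-water is t1 (the minimum) and no initial scan is performed; if the
        // watches carry no resolved timestamps, the statement time is used and an
        // initial scan is requested.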
   285  func getKVFeedInitialParameters(
   286  	spec execinfrapb.ChangeAggregatorSpec,
   287  ) (initialHighWater hlc.Timestamp, needsInitialScan bool) {
   288  	for _, watch := range spec.Watches {
   289  		if initialHighWater.IsEmpty() || watch.InitialResolved.Less(initialHighWater) {
   290  			initialHighWater = watch.InitialResolved
   291  		}
   292  	}
   293  	// This will be true either when we have no cursor and we've never
   294  	// checkpointed a resolved timestamp, or when we have a cursor but want an
   295  	// initial scan. The higher levels coordinate so that we only have empty
   296  	// watches when we need an initial scan.
   297  	if needsInitialScan = initialHighWater.IsEmpty(); needsInitialScan {
   298  		initialHighWater = spec.Feed.StatementTime
   299  	}
   300  	return initialHighWater, needsInitialScan
   301  }
   302  
   303  // setupSpans is called on start to extract the spans for this changefeed as a
   304  // slice and to create a span frontier with the initial resolved timestamps. This
   305  // SpanFrontier only tracks the spans being watched on this node. There is a
   306  // different SpanFrontier elsewhere for the entire changefeed. This object is
   307  // used to filter out some previously emitted rows, and by the cloudStorageSink
   308  // to name its output files in lexicographically monotonic fashion.
   309  func (ca *changeAggregator) setupSpans() ([]roachpb.Span, *span.Frontier) {
   310  	spans := make([]roachpb.Span, 0, len(ca.spec.Watches))
   311  	for _, watch := range ca.spec.Watches {
   312  		spans = append(spans, watch.Span)
   313  	}
   314  	sf := span.MakeFrontier(spans...)
   315  	for _, watch := range ca.spec.Watches {
   316  		sf.Forward(watch.Span, watch.InitialResolved)
   317  	}
   318  	return spans, sf
   319  }
   320  
   321  // close has two purposes: to synchronize on the completion of the helper
   322  // goroutines created by the Start method, and to clean up any resources used by
   323  // the processor. Due to the fact that this method may be called even if the
   324  // processor never fully started, there is an excessive amount of nil
   325  // checking.
   326  func (ca *changeAggregator) close() {
   327  	if ca.InternalClose() {
   328  		// Shut down the poller if it wasn't already.
   329  		if ca.cancel != nil {
   330  			ca.cancel()
   331  		}
   332  		// Wait for the poller to finish shutting down.
   333  		if ca.kvFeedDoneCh != nil {
   334  			<-ca.kvFeedDoneCh
   335  		}
   336  		if ca.sink != nil {
   337  			if err := ca.sink.Close(); err != nil {
   338  				log.Warningf(ca.Ctx, `error closing sink. goroutines may have leaked: %v`, err)
   339  			}
   340  		}
   341  		ca.memAcc.Close(ca.Ctx)
   342  		if ca.kvFeedMemMon != nil {
   343  			ca.kvFeedMemMon.Stop(ca.Ctx)
   344  		}
   345  		ca.MemMonitor.Stop(ca.Ctx)
   346  	}
   347  }
   348  
   349  // Next is part of the RowSource interface.
   350  func (ca *changeAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   351  	for ca.State == execinfra.StateRunning {
   352  		if !ca.changedRowBuf.IsEmpty() {
   353  			return ca.changedRowBuf.Pop(), nil
   354  		} else if !ca.resolvedSpanBuf.IsEmpty() {
   355  			return ca.resolvedSpanBuf.Pop(), nil
   356  		}
   357  		if err := ca.tick(); err != nil {
   358  			select {
   359  			// If the poller errored first, that's the
   360  			// interesting one, so overwrite `err`.
   361  			case err = <-ca.errCh:
   362  			default:
   363  			}
   364  			// Shut down the poller if it wasn't already.
   365  			ca.cancel()
   366  
   367  			ca.MoveToDraining(err)
   368  			break
   369  		}
   370  	}
   371  	return nil, ca.DrainHelper()
   372  }
   373  
   374  func (ca *changeAggregator) tick() error {
   375  	resolvedSpans, err := ca.tickFn(ca.Ctx)
   376  	if err != nil {
   377  		return err
   378  	}
   379  
   380  	for _, resolvedSpan := range resolvedSpans {
   381  		resolvedBytes, err := protoutil.Marshal(&resolvedSpan)
   382  		if err != nil {
   383  			return err
   384  		}
   385  		// Enqueue a row to be returned that indicates some span-level resolved
   386  		// timestamp has advanced. If any rows were queued in `sink`, they must
   387  		// be emitted first.
   388  		ca.resolvedSpanBuf.Push(sqlbase.EncDatumRow{
   389  			sqlbase.EncDatum{Datum: tree.NewDBytes(tree.DBytes(resolvedBytes))},
   390  			sqlbase.EncDatum{Datum: tree.DNull}, // topic
   391  			sqlbase.EncDatum{Datum: tree.DNull}, // key
   392  			sqlbase.EncDatum{Datum: tree.DNull}, // value
   393  		})
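        		// The row layout mirrors changefeedResultTypes: a non-NULL first column
        		// marks a resolved-span row, whereas changed rows surfaced through a
        		// bufferSink leave it NULL and fill topic/key/value instead (see the
        		// row[0].IsNull() check in changeFrontier.Next).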
   394  	}
   395  	return nil
   396  }
   397  
   398  // ConsumerDone is part of the RowSource interface.
   399  func (ca *changeAggregator) ConsumerDone() {
   400  	ca.MoveToDraining(nil /* err */)
   401  }
   402  
   403  // ConsumerClosed is part of the RowSource interface.
   404  func (ca *changeAggregator) ConsumerClosed() {
   405  	// The consumer is done, Next() will not be called again.
   406  	ca.InternalClose()
   407  }
   408  
   409  const (
   410  	emitAllResolved = 0
   411  	emitNoResolved  = -1
   412  )
   413  
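        // changeFrontier is the single coordinating processor of a changefeed flow.
        // It merges span-level resolved timestamps from all changeAggregators into
        // one frontier, checkpoints job progress, emits changefeed-level resolved
        // timestamps, and (for feeds backed by a job) manages protected timestamps
        // around schema-change boundaries.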
   414  type changeFrontier struct {
   415  	execinfra.ProcessorBase
   416  
   417  	flowCtx *execinfra.FlowCtx
   418  	spec    execinfrapb.ChangeFrontierSpec
   419  	memAcc  mon.BoundAccount
   420  	a       sqlbase.DatumAlloc
   421  
   422  	// input returns rows from one or more changeAggregator processors
   423  	input execinfra.RowSource
   424  
   425  	// sf contains the current resolved timestamp high-water for the tracked
   426  	// span set.
   427  	sf *span.Frontier
   428  	// encoder is the Encoder to use for resolved timestamp serialization.
   429  	encoder Encoder
   430  	// sink is the Sink to write resolved timestamps to. Rows are never written
   431  	// by changeFrontier.
   432  	sink Sink
   433  	// freqEmitResolved, if >= 0, is a lower bound on the duration between
   434  	// resolved timestamp emits.
   435  	freqEmitResolved time.Duration
   436  	// lastEmitResolved is the last time a resolved timestamp was emitted.
   437  	lastEmitResolved time.Time
   438  	// lastSlowSpanLog is the last time a slow span from `sf` was logged.
   439  	lastSlowSpanLog time.Time
   440  
   441  	// schemaChangeBoundary represents an hlc timestamp at which a schema change
   442  	// event occurred to a target watched by this frontier. If the changefeed is
   443  	// configured to stop on schema change then the changeFrontier will wait for
   444  	// the span frontier to reach the schemaChangeBoundary, will drain, and then
   445  	// will exit. If the changefeed is configured to backfill on schema changes,
   446  	// the changeFrontier will protect the scan timestamp in order to ensure that
   447  	// the scan completes. The protected timestamp will be released when a new scan
   448  	// schemaChangeBoundary is created or the changefeed reaches a timestamp that
   449  	// is near the present.
   450  	//
   451  	// schemaChangeBoundary values are communicated to the changeFrontier via
   452  	// Resolved messages sent from the changeAggregators. The policy regarding
   453  	// which schema change events lead to a schemaChangeBoundary is controlled
   454  	// by the KV feed based on OptSchemaChangeEvents and OptSchemaChangePolicy.
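        	// As a sketch of the user-visible behavior: with schema_change_policy='stop'
        	// a schema change on a watched table surfaces as a boundary, and once the
        	// frontier reaches it the changefeed exits with a SchemaChangeOccurred error
        	// (see Next); with the backfill policy the boundary's timestamp is protected
        	// instead (see maybeProtectTimestamp).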
   455  	schemaChangeBoundary hlc.Timestamp
   456  
   457  	// jobProgressedFn, if non-nil, is called to checkpoint the changefeed's
   458  	// progress in the corresponding system job entry.
   459  	jobProgressedFn func(context.Context, jobs.HighWaterProgressedFn) error
   460  	// highWaterAtStart is the greater of the job high-water and the timestamp the
   461  	// CHANGEFEED statement was run at. It's used in an assertion that we never
   462  	// regress the job high-water.
   463  	highWaterAtStart hlc.Timestamp
   464  	// passthroughBuf, in some but not all flows, contains changed row data to
   465  	// pass through unchanged to the gateway node.
   466  	passthroughBuf encDatumRowBuffer
   467  	// resolvedBuf, if non-nil, contains rows indicating a changefeed-level
   468  	// resolved timestamp to be returned. It depends on everything in
   469  	// `passthroughBuf` being sent, so that one needs to be emptied first.
   470  	resolvedBuf *encDatumRowBuffer
   471  	// metrics are monitoring counters shared between all changefeeds.
   472  	metrics *Metrics
   473  	// metricsID is used as the unique id of this changefeed in the
   474  	// metrics.MaxBehindNanos map.
   475  	metricsID int
   476  }
   477  
   478  var _ execinfra.Processor = &changeFrontier{}
   479  var _ execinfra.RowSource = &changeFrontier{}
   480  
   481  func newChangeFrontierProcessor(
   482  	flowCtx *execinfra.FlowCtx,
   483  	processorID int32,
   484  	spec execinfrapb.ChangeFrontierSpec,
   485  	input execinfra.RowSource,
   486  	output execinfra.RowReceiver,
   487  ) (execinfra.Processor, error) {
   488  	ctx := flowCtx.EvalCtx.Ctx()
   489  	memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "changefntr-mem")
   490  	cf := &changeFrontier{
   491  		flowCtx: flowCtx,
   492  		spec:    spec,
   493  		memAcc:  memMonitor.MakeBoundAccount(),
   494  		input:   input,
   495  		sf:      span.MakeFrontier(spec.TrackedSpans...),
   496  	}
   497  	if err := cf.Init(
   498  		cf, &execinfrapb.PostProcessSpec{},
   499  		input.OutputTypes(),
   500  		flowCtx,
   501  		processorID,
   502  		output,
   503  		memMonitor,
   504  		execinfra.ProcStateOpts{
   505  			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
   506  				cf.close()
   507  				return nil
   508  			},
   509  			InputsToDrain: []execinfra.RowSource{cf.input},
   510  		},
   511  	); err != nil {
   512  		return nil, err
   513  	}
   514  
   515  	if r, ok := cf.spec.Feed.Opts[changefeedbase.OptResolvedTimestamps]; ok {
   516  		var err error
   517  		if r == `` {
   518  			// Empty means emit them as often as we have them.
   519  			cf.freqEmitResolved = emitAllResolved
   520  		} else if cf.freqEmitResolved, err = time.ParseDuration(r); err != nil {
   521  			return nil, err
   522  		}
   523  	} else {
   524  		cf.freqEmitResolved = emitNoResolved
   525  	}
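        	// As a sketch of the resulting behavior: `WITH resolved` (no value) emits a
        	// resolved timestamp whenever the frontier advances, while `WITH
        	// resolved='10s'` rate-limits emission to at most roughly one per 10s; the
        	// option value is assumed to parse with time.ParseDuration as above.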
   526  
   527  	var err error
   528  	if cf.encoder, err = getEncoder(spec.Feed.Opts); err != nil {
   529  		return nil, err
   530  	}
   531  
   532  	return cf, nil
   533  }
   534  
   535  func (cf *changeFrontier) OutputTypes() []*types.T {
   536  	return changefeedResultTypes
   537  }
   538  
   539  // Start is part of the RowSource interface.
   540  func (cf *changeFrontier) Start(ctx context.Context) context.Context {
   541  	cf.input.Start(ctx)
   542  
   543  	// StartInternal is called at the beginning of the function because there are
   544  	// early returns if errors are detected.
   545  	ctx = cf.StartInternal(ctx, changeFrontierProcName)
   546  
   547  	nodeID, err := cf.flowCtx.EvalCtx.NodeID.OptionalNodeIDErr(48274)
   548  	if err != nil {
   549  		cf.MoveToDraining(err)
   550  		return ctx
   551  	}
   552  	// Pass a nil oracle because this sink is only used to emit resolved timestamps
   553  	// but the oracle is only used when emitting row updates.
   554  	var nilOracle timestampLowerBoundOracle
   555  	if cf.sink, err = getSink(
   556  		ctx, cf.spec.Feed.SinkURI, nodeID, cf.spec.Feed.Opts, cf.spec.Feed.Targets,
   557  		cf.flowCtx.Cfg.Settings, nilOracle, cf.flowCtx.Cfg.ExternalStorageFromURI,
   558  	); err != nil {
   559  		err = MarkRetryableError(err)
   560  		cf.MoveToDraining(err)
   561  		return ctx
   562  	}
   563  
   564  	if b, ok := cf.sink.(*bufferSink); ok {
   565  		cf.resolvedBuf = &b.buf
   566  	}
   567  
   568  	// The job registry has a set of metrics used to monitor the various jobs it
   569  	// runs. They're all stored as the `metric.Struct` interface because of
   570  	// dependency cycles.
   571  	cf.metrics = cf.flowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
   572  	cf.sink = makeMetricsSink(cf.metrics, cf.sink)
   573  	cf.sink = &errorWrapperSink{wrapped: cf.sink}
   574  
   575  	cf.highWaterAtStart = cf.spec.Feed.StatementTime
   576  	if cf.spec.JobID != 0 {
   577  		job, err := cf.flowCtx.Cfg.JobRegistry.LoadJob(ctx, cf.spec.JobID)
   578  		if err != nil {
   579  			cf.MoveToDraining(err)
   580  			return ctx
   581  		}
   582  		cf.jobProgressedFn = job.HighWaterProgressed
   583  
   584  		p := job.Progress()
   585  		if ts := p.GetHighWater(); ts != nil {
   586  			cf.highWaterAtStart.Forward(*ts)
   587  		}
   588  	}
   589  
   590  	cf.metrics.mu.Lock()
   591  	cf.metricsID = cf.metrics.mu.id
   592  	cf.metrics.mu.id++
   593  	cf.metrics.mu.Unlock()
   594  	// TODO(dan): It's very important that we de-register from the metric because
   595  	// if we orphan an entry in there, our monitoring will lie (say the changefeed
   596  	// is behind when it may not be). We call this in `close` but that doesn't
   597  	// always get called when the processor is shut down (especially during crdb
   598  	// chaos), so here's something that maybe will work some of the times that
   599  	// close doesn't. This is all very hacky. The real answer is to fix whatever
   600  	// bugs currently exist in processor shutdown.
   601  	go func() {
   602  		<-ctx.Done()
   603  		cf.closeMetrics()
   604  	}()
   605  
   606  	return ctx
   607  }
   608  
   609  func (cf *changeFrontier) close() {
   610  	if cf.InternalClose() {
   611  		if cf.metrics != nil {
   612  			cf.closeMetrics()
   613  		}
   614  		if cf.sink != nil {
   615  			if err := cf.sink.Close(); err != nil {
   616  				log.Warningf(cf.Ctx, `error closing sink. goroutines may have leaked: %v`, err)
   617  			}
   618  		}
   619  		cf.memAcc.Close(cf.Ctx)
   620  		cf.MemMonitor.Stop(cf.Ctx)
   621  	}
   622  }
   623  
   624  // closeMetrics de-registers from the progress registry that powers
   625  // `changefeed.max_behind_nanos`. This method is idempotent.
   626  func (cf *changeFrontier) closeMetrics() {
   627  	// Delete this feed from the MaxBehindNanos metric so it's no longer
   628  	// considered by the gauge.
   629  	cf.metrics.mu.Lock()
   630  	delete(cf.metrics.mu.resolved, cf.metricsID)
   631  	cf.metricsID = -1
   632  	cf.metrics.mu.Unlock()
   633  }
   634  
   635  // schemaChangeBoundaryReached returns true if the spanFrontier is at the
   636  // current schemaChangeBoundary.
   637  func (cf *changeFrontier) schemaChangeBoundaryReached() (r bool) {
   638  	return !cf.schemaChangeBoundary.IsEmpty() && cf.schemaChangeBoundary.Equal(cf.sf.Frontier())
   639  }
   640  
   641  // shouldFailOnSchemaChange checks the job's spec to determine whether it should
   642  // fail on schema change events after all spans have been resolved.
   643  func (cf *changeFrontier) shouldFailOnSchemaChange() bool {
   644  	policy := changefeedbase.SchemaChangePolicy(cf.spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
   645  	return policy == changefeedbase.OptSchemaChangePolicyStop
   646  }
   647  
   648  // shouldProtectBoundaries checks the job's spec to determine whether it should
   649  // install protected timestamps when encountering scan boundaries.
   650  func (cf *changeFrontier) shouldProtectBoundaries() bool {
   651  	policy := changefeedbase.SchemaChangePolicy(cf.spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
   652  	return policy == changefeedbase.OptSchemaChangePolicyBackfill
   653  }
   654  
   655  // Next is part of the RowSource interface.
   656  func (cf *changeFrontier) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   657  	for cf.State == execinfra.StateRunning {
   658  		if !cf.passthroughBuf.IsEmpty() {
   659  			return cf.passthroughBuf.Pop(), nil
   660  		} else if !cf.resolvedBuf.IsEmpty() {
   661  			return cf.resolvedBuf.Pop(), nil
   662  		}
   663  
   664  		if cf.schemaChangeBoundaryReached() && cf.shouldFailOnSchemaChange() {
   665  			// TODO(ajwerner): make this more useful by at least informing the client
   666  			// of which tables changed.
   667  			cf.MoveToDraining(pgerror.Newf(pgcode.SchemaChangeOccurred,
   668  				"schema change occurred at %v", cf.schemaChangeBoundary.Next().AsOfSystemTime()))
   669  			break
   670  		}
   671  
   672  		row, meta := cf.input.Next()
   673  		if meta != nil {
   674  			if meta.Err != nil {
   675  				cf.MoveToDraining(nil /* err */)
   676  			}
   677  			return nil, meta
   678  		}
   679  		if row == nil {
   680  			cf.MoveToDraining(nil /* err */)
   681  			break
   682  		}
   683  
   684  		if row[0].IsNull() {
   685  			// In changefeeds with a sink, this will never happen. But in the
   686  			// core changefeed, which returns changed rows directly via pgwire,
   687  			// a row with a null resolved_span field is a changed row that needs
   688  			// to be forwarded to the gateway.
   689  			cf.passthroughBuf.Push(row)
   690  			continue
   691  		}
   692  
   693  		if err := cf.noteResolvedSpan(row[0]); err != nil {
   694  			cf.MoveToDraining(err)
   695  			break
   696  		}
   697  	}
   698  	return nil, cf.DrainHelper()
   699  }
   700  
   701  func (cf *changeFrontier) noteResolvedSpan(d sqlbase.EncDatum) error {
   702  	if err := d.EnsureDecoded(changefeedResultTypes[0], &cf.a); err != nil {
   703  		return err
   704  	}
   705  	raw, ok := d.Datum.(*tree.DBytes)
   706  	if !ok {
   707  		return errors.AssertionFailedf(`unexpected datum type %T: %s`, d.Datum, d.Datum)
   708  	}
   709  	var resolved jobspb.ResolvedSpan
   710  	if err := protoutil.Unmarshal([]byte(*raw), &resolved); err != nil {
   711  		return errors.NewAssertionErrorWithWrappedErrf(err,
   712  			`unmarshalling resolved span: %x`, raw)
   713  	}
   714  
   715  	// Inserting a timestamp less than the one the changefeed flow started at
   716  	// could potentially regress the job progress. This is not expected, but it
   717  	// was a bug at one point, so assert to prevent regressions.
   718  	//
   719  	// TODO(dan): This is much more naturally expressed as an assert inside the
   720  	// job progress update closure, but it currently doesn't pass along the info
   721  	// we'd need to do it that way.
   722  	if !resolved.Timestamp.IsEmpty() && resolved.Timestamp.Less(cf.highWaterAtStart) {
   723  		log.ReportOrPanic(cf.Ctx, &cf.flowCtx.Cfg.Settings.SV,
   724  			`got a span level timestamp %s for %s that is less than the initial high-water %s`,
   725  			log.Safe(resolved.Timestamp), resolved.Span, log.Safe(cf.highWaterAtStart))
   726  		return nil
   727  	}
   728  
   729  	// We want to ensure that we mark the schemaChangeBoundary and then detect
   730  	// when the frontier reaches or passes the schemaChangeBoundary.
   731  	if resolved.BoundaryReached && (cf.schemaChangeBoundary.IsEmpty() || resolved.Timestamp.Less(cf.schemaChangeBoundary)) {
   732  		cf.schemaChangeBoundary = resolved.Timestamp
   733  	}
   734  	// If we've moved past a schemaChangeBoundary, make sure to clear it.
   735  	if !resolved.BoundaryReached && !cf.schemaChangeBoundary.IsEmpty() && cf.schemaChangeBoundary.Less(resolved.Timestamp) {
   736  		cf.schemaChangeBoundary = hlc.Timestamp{}
   737  	}
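        	// In other words, schemaChangeBoundary tracks the minimum boundary
        	// timestamp reported by any aggregator, and it is cleared once resolved
        	// timestamps move past it without the boundary flag set.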
   738  
   739  	frontierChanged := cf.sf.Forward(resolved.Span, resolved.Timestamp)
   740  	isBehind := cf.maybeLogBehindSpan(frontierChanged)
   741  	if frontierChanged {
   742  		if err := cf.handleFrontierChanged(isBehind); err != nil {
   743  			return err
   744  		}
   745  	}
   746  	return nil
   747  }
   748  
   749  func (cf *changeFrontier) handleFrontierChanged(isBehind bool) error {
   750  	newResolved := cf.sf.Frontier()
   751  	cf.metrics.mu.Lock()
   752  	if cf.metricsID != -1 {
   753  		cf.metrics.mu.resolved[cf.metricsID] = newResolved
   754  	}
   755  	cf.metrics.mu.Unlock()
   756  	if err := cf.checkpointResolvedTimestamp(newResolved, isBehind); err != nil {
   757  		return err
   758  	}
   759  	if err := cf.maybeEmitResolved(newResolved); err != nil {
   760  		return err
   761  	}
   762  	return nil
   763  }
   764  
   765  // checkpointResolvedTimestamp checkpoints a changefeed-level resolved timestamp
   766  // to the jobs record. It additionally manages the protected timestamp state
   767  // which is stored in the job progress details. It is only called if the new
   768  // resolved timestamp is later than the current one. The isBehind argument is
   769  // used to determine whether an existing protected timestamp should be released.
   770  func (cf *changeFrontier) checkpointResolvedTimestamp(
   771  	resolved hlc.Timestamp, isBehind bool,
   772  ) (err error) {
   773  	// NB: Sinkless changefeeds will not have a jobProgressedFn. In fact, they
   774  	// have no distributed state whatsoever. Because of this they also do not
   775  	// use protected timestamps.
   776  	if cf.jobProgressedFn == nil {
   777  		return nil
   778  	}
   779  	return cf.jobProgressedFn(cf.Ctx, func(
   780  		ctx context.Context, txn *kv.Txn, details jobspb.ProgressDetails,
   781  	) (hlc.Timestamp, error) {
   782  		progress := details.(*jobspb.Progress_Changefeed).Changefeed
   783  		if err := cf.manageProtectedTimestamps(ctx, progress, txn, resolved, isBehind); err != nil {
   784  			return hlc.Timestamp{}, err
   785  		}
   786  		return resolved, nil
   787  	})
   788  }
   789  
   790  // manageProtectedTimestamps is called when the resolved timestamp is being
   791  // checkpointed. The changeFrontier always checkpoints resolved timestamps
   792  // which occur at scan boundaries. It releases previously protected timestamps
   793  // if the changefeed is not behind. See maybeLogBehindSpan for details on the
   794  // behind calculation.
   795  //
   796  // Note that this function is never called for sinkless changefeeds as they have
   797  // no corresponding job and thus no corresponding distributed state on which to
   798  // attach protected timestamp information.
   799  //
   800  // TODO(ajwerner): Adopt protected timestamps for sinkless changefeeds,
   801  // perhaps by using whatever mechanism is eventually built to protect
   802  // data for long-running SQL transactions. There's some discussion of this
   803  // use case in the protected timestamps RFC.
   804  func (cf *changeFrontier) manageProtectedTimestamps(
   805  	ctx context.Context,
   806  	progress *jobspb.ChangefeedProgress,
   807  	txn *kv.Txn,
   808  	resolved hlc.Timestamp,
   809  	isBehind bool,
   810  ) error {
   811  	pts := cf.flowCtx.Cfg.ProtectedTimestampProvider
   812  	if err := cf.maybeReleaseProtectedTimestamp(ctx, progress, pts, txn, isBehind); err != nil {
   813  		return err
   814  	}
   815  	return cf.maybeProtectTimestamp(ctx, progress, pts, txn, resolved)
   816  }
   817  
   818  // maybeReleaseProtectedTimestamp will release the current protected timestamp
   819  // if either the resolved timestamp is close to the present or we've reached
   820  // a new schemaChangeBoundary which will be protected.
   821  func (cf *changeFrontier) maybeReleaseProtectedTimestamp(
   822  	ctx context.Context,
   823  	progress *jobspb.ChangefeedProgress,
   824  	pts protectedts.Storage,
   825  	txn *kv.Txn,
   826  	isBehind bool,
   827  ) error {
   828  	if progress.ProtectedTimestampRecord == uuid.Nil {
   829  		return nil
   830  	}
   831  	if !cf.schemaChangeBoundaryReached() && isBehind {
   832  		log.VEventf(ctx, 2, "not releasing protected timestamp because changefeed is behind")
   833  		return nil
   834  	}
   835  	log.VEventf(ctx, 2, "releasing protected timestamp %v",
   836  		progress.ProtectedTimestampRecord)
   837  	if err := pts.Release(ctx, txn, progress.ProtectedTimestampRecord); err != nil {
   838  		return err
   839  	}
   840  	progress.ProtectedTimestampRecord = uuid.Nil
   841  	return nil
   842  }
   843  
   844  // maybeProtectTimestamp creates a new protected timestamp when the
   845  // changeFrontier reaches a scanBoundary and the schemaChangePolicy indicates
   846  // that we should perform a backfill (see cf.shouldProtectBoundaries()).
   847  func (cf *changeFrontier) maybeProtectTimestamp(
   848  	ctx context.Context,
   849  	progress *jobspb.ChangefeedProgress,
   850  	pts protectedts.Storage,
   851  	txn *kv.Txn,
   852  	resolved hlc.Timestamp,
   853  ) error {
   854  	if cf.isSinkless() || !cf.schemaChangeBoundaryReached() || !cf.shouldProtectBoundaries() {
   855  		return nil
   856  	}
   857  
   858  	jobID := cf.spec.JobID
   859  	targets := cf.spec.Feed.Targets
   860  	return createProtectedTimestampRecord(ctx, pts, txn, jobID, targets, resolved, progress)
   861  }
   862  
   863  func (cf *changeFrontier) maybeEmitResolved(newResolved hlc.Timestamp) error {
   864  	if cf.freqEmitResolved == emitNoResolved {
   865  		return nil
   866  	}
   867  	sinceEmitted := newResolved.GoTime().Sub(cf.lastEmitResolved)
   868  	shouldEmit := sinceEmitted >= cf.freqEmitResolved || cf.schemaChangeBoundaryReached()
   869  	if !shouldEmit {
   870  		return nil
   871  	}
   872  	// Keeping this after the checkpointResolvedTimestamp call will avoid
   873  	// some duplicates if a restart happens.
   874  	if err := emitResolvedTimestamp(cf.Ctx, cf.encoder, cf.sink, newResolved); err != nil {
   875  		return err
   876  	}
   877  	cf.lastEmitResolved = newResolved.GoTime()
   878  	return nil
   879  }
   880  
   881  // maybeLogBehindSpan potentially logs the most behind span in the frontier for
   882  // debugging. The returned boolean will be true if the resolved timestamp lags
   883  // far behind the present as defined by the current configuration.
   884  func (cf *changeFrontier) maybeLogBehindSpan(frontierChanged bool) (isBehind bool) {
   885  	// These two cluster setting values represent the target responsiveness of
   886  	// poller and range feed. The cluster setting for switching between poller and
   887  	// rangefeed is only checked at changefeed start/resume, so instead of
   888  	// switching on it here, just add them. Also add 1 second in case both these
   889  	// settings are set really low (as they are in unit tests).
   890  	pollInterval := changefeedbase.TableDescriptorPollInterval.Get(&cf.flowCtx.Cfg.Settings.SV)
   891  	closedtsInterval := closedts.TargetDuration.Get(&cf.flowCtx.Cfg.Settings.SV)
   892  	slownessThreshold := time.Second + 10*(pollInterval+closedtsInterval)
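        	// For illustration only (these values are not asserted to be the cluster
        	// defaults): with pollInterval=1s and closedtsInterval=30s the threshold
        	// works out to 1s + 10*(1s+30s) = 311s, i.e. a little over five minutes.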
   893  	frontier := cf.sf.Frontier()
   894  	now := timeutil.Now()
   895  	resolvedBehind := now.Sub(frontier.GoTime())
   896  	if resolvedBehind <= slownessThreshold {
   897  		return false
   898  	}
   899  
   900  	description := `sinkless feed`
   901  	if !cf.isSinkless() {
   902  		description = fmt.Sprintf("job %d", cf.spec.JobID)
   903  	}
   904  	if frontierChanged {
   905  		log.Infof(cf.Ctx, "%s new resolved timestamp %s is behind by %s",
   906  			description, frontier, resolvedBehind)
   907  	}
   908  	const slowSpanMaxFrequency = 10 * time.Second
   909  	if now.Sub(cf.lastSlowSpanLog) > slowSpanMaxFrequency {
   910  		cf.lastSlowSpanLog = now
   911  		s := cf.sf.PeekFrontierSpan()
   912  		log.Infof(cf.Ctx, "%s span %s is behind by %s", description, s, resolvedBehind)
   913  	}
   914  	return true
   915  }
   916  
   917  // ConsumerDone is part of the RowSource interface.
   918  func (cf *changeFrontier) ConsumerDone() {
   919  	cf.MoveToDraining(nil /* err */)
   920  }
   921  
   922  // ConsumerClosed is part of the RowSource interface.
   923  func (cf *changeFrontier) ConsumerClosed() {
   924  	// The consumer is done, Next() will not be called again.
   925  	cf.InternalClose()
   926  }
   927  
   928  // isSinkless returns true if this changeFrontier is sinkless and thus does not
   929  // have a job.
   930  func (cf *changeFrontier) isSinkless() bool {
   931  	return cf.spec.JobID == 0
   932  }