github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed_dist.go

// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
)

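// The changefeed processors are implemented in this CCL package; assigning
// these hooks makes their constructors available to the OSS rowexec package.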
func init() {
	rowexec.NewChangeAggregatorProcessor = newChangeAggregatorProcessor
	rowexec.NewChangeFrontierProcessor = newChangeFrontierProcessor
}

const (
	changeAggregatorProcName = `changeagg`
	changeFrontierProcName   = `changefntr`
)

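// changefeedResultTypes is the schema of the rows that flow through a
// changefeed: a marshaled resolved-span proto plus the emitted topic, key,
// and value.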
var changefeedResultTypes = []*types.T{
	types.Bytes,  // resolved span
	types.String, // topic
	types.Bytes,  // key
	types.Bytes,  // value
}

// distChangefeedFlow plans and runs a distributed changefeed.
//
// One or more ChangeAggregator processors watch table data for changes. These
// transform the changed kvs into changed rows and either emit them to a sink
// (such as kafka) or, if there is no sink, forward them in columns 1, 2, and 3
// (where they will eventually be returned directly via pgwire). In either
// case, periodically a span will become resolved as of some timestamp, meaning
// that no new rows will ever be emitted at or below that timestamp. These
// span-level resolved timestamps are emitted as a marshaled
// `jobspb.ResolvedSpan` proto in column 0.
//
// The flow will always have exactly one ChangeFrontier processor which all the
// ChangeAggregators feed into. It collects all span-level resolved timestamps
// and aggregates them into a changefeed-level resolved timestamp, which is the
// minimum of the span-level resolved timestamps. This changefeed-level resolved
// timestamp is emitted into the changefeed sink (or returned to the gateway if
// there is no sink) whenever it advances. ChangeFrontier also updates the
// progress of the changefeed's corresponding system job.
func distChangefeedFlow(
	ctx context.Context,
	phs sql.PlanHookState,
	jobID int64,
	details jobspb.ChangefeedDetails,
	progress jobspb.Progress,
	resultsCh chan<- tree.Datums,
) error {
	var err error
	details, err = validateDetails(details)
	if err != nil {
		return err
	}

	// NB: A non-empty high water indicates that we have checkpointed a resolved
	// timestamp. Skipping the initial scan is equivalent to starting the
	// changefeed from a checkpoint at its start time. Initialize the progress
	// based on whether we should perform an initial scan.
	{
		h := progress.GetHighWater()
		noHighWater := (h == nil || *h == (hlc.Timestamp{}))
		// We want to set the high-water mark, and thus skip the initial scan,
		// if either a cursor was specified without an explicit request for an
		// initial scan, or no cursor was specified but the user asked to skip
		// the initial scan.
		if noHighWater && !initialScanFromOptions(details.Opts) {
			// If there is a cursor, the statement time has already been set to it.
			progress.Progress = &jobspb.Progress_HighWater{HighWater: &details.StatementTime}
		}
	}

	spansTS := details.StatementTime
	var initialHighWater hlc.Timestamp
	if h := progress.GetHighWater(); h != nil && *h != (hlc.Timestamp{}) {
		initialHighWater = *h
		// If we have a high-water set, use it to compute the spans, since the
		// ones at the statement time may have been garbage collected by now.
		spansTS = initialHighWater
	}

	execCfg := phs.ExecCfg()
	trackedSpans, err := fetchSpansForTargets(ctx, execCfg.DB, execCfg.Codec, details.Targets, spansTS)
	if err != nil {
		return err
	}

	// Changefeed flows handle transactional consistency themselves.
	var noTxn *kv.Txn
	gatewayNodeID, err := execCfg.NodeID.OptionalNodeIDErr(48274)
	if err != nil {
		return err
	}
	dsp := phs.DistSQLPlanner()
	evalCtx := phs.ExtendedEvalContext()
	planCtx := dsp.NewPlanningCtx(ctx, evalCtx, noTxn, true /* distribute */)

	var spanPartitions []sql.SpanPartition
	if details.SinkURI == `` {
		// Sinkless feeds get one ChangeAggregator on the gateway.
		spanPartitions = []sql.SpanPartition{{Node: gatewayNodeID, Spans: trackedSpans}}
	} else {
		// All other feeds get ChangeAggregators local to the leaseholders of
		// the watched spans.
		spanPartitions, err = dsp.PartitionSpans(planCtx, trackedSpans)
		if err != nil {
			return err
		}
	}

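	// Construct one ChangeAggregator processor per span partition, assigning
	// each the spans it will watch and their initial resolved timestamp.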
	changeAggregatorProcs := make([]physicalplan.Processor, 0, len(spanPartitions))
	for _, sp := range spanPartitions {
		// TODO(dan): Merge these watches with the span-level resolved
		// timestamps from the job progress.
		watches := make([]execinfrapb.ChangeAggregatorSpec_Watch, len(sp.Spans))
		for i, nodeSpan := range sp.Spans {
			watches[i] = execinfrapb.ChangeAggregatorSpec_Watch{
				Span:            nodeSpan,
				InitialResolved: initialHighWater,
			}
		}

		changeAggregatorProcs = append(changeAggregatorProcs, physicalplan.Processor{
			Node: sp.Node,
			Spec: execinfrapb.ProcessorSpec{
				Core: execinfrapb.ProcessorCoreUnion{
					ChangeAggregator: &execinfrapb.ChangeAggregatorSpec{
						Watches: watches,
						Feed:    details,
					},
				},
				Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
			},
		})
	}
	// NB: This SpanFrontier processor depends on the set of tracked spans being
	// static. Currently there is no way for them to change after the changefeed
	// is created, even if it is paused and unpaused, but #28982 describes some
	// ways that this might happen in the future.
	changeFrontierSpec := execinfrapb.ChangeFrontierSpec{
		TrackedSpans: trackedSpans,
		Feed:         details,
		JobID:        jobID,
	}

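	// Assemble the physical plan: the ChangeAggregators form a single stage
	// whose outputs all feed the lone ChangeFrontier on the gateway node.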
	var p sql.PhysicalPlan

	stageID := p.NewStageID()
	p.ResultRouters = make([]physicalplan.ProcessorIdx, len(changeAggregatorProcs))
	for i, proc := range changeAggregatorProcs {
		proc.Spec.StageID = stageID
		pIdx := p.AddProcessor(proc)
		p.ResultRouters[i] = pIdx
	}

	p.AddSingleGroupStage(
		gatewayNodeID,
		execinfrapb.ProcessorCoreUnion{ChangeFrontier: &changeFrontierSpec},
		execinfrapb.PostProcessSpec{},
		changefeedResultTypes,
	)

	p.ResultTypes = changefeedResultTypes
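	// The plan's result columns map onto the topic, key, and value columns
	// (1, 2, 3); the resolved-span column 0 is not returned to the client.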
	p.PlanToStreamColMap = []int{1, 2, 3}
	dsp.FinalizePlan(planCtx, &p)

	resultRows := makeChangefeedResultWriter(resultsCh)
	recv := sql.MakeDistSQLReceiver(
		ctx,
		resultRows,
		tree.Rows,
		execCfg.RangeDescriptorCache,
		execCfg.LeaseHolderCache,
		noTxn,
		func(ts hlc.Timestamp) {},
		evalCtx.Tracing,
	)
	defer recv.Release()

	var finishedSetupFn func()
	if details.SinkURI != `` {
		// We abuse the job's results channel to make CREATE CHANGEFEED wait for
		// this before returning to the user to ensure the setup went okay. Job
		// resumption doesn't have the same hack, but at the moment ignores
		// results and so is currently okay. Return nil instead of anything
		// meaningful so that if we start doing anything with the results
		// returned by resumed jobs, then it breaks instead of returning
		// nonsense.
		finishedSetupFn = func() { resultsCh <- tree.Datums(nil) }
	}

	// Copy the evalCtx, as dsp.Run() might change it.
	evalCtxCopy := *evalCtx
	dsp.Run(planCtx, noTxn, &p, recv, &evalCtxCopy, finishedSetupFn)()
	return resultRows.Err()
}

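// fetchSpansForTargets returns the primary index spans of the targeted
// tables, reading their descriptors as of the given timestamp.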
func fetchSpansForTargets(
	ctx context.Context,
	db *kv.DB,
	codec keys.SQLCodec,
	targets jobspb.ChangefeedTargets,
	ts hlc.Timestamp,
) ([]roachpb.Span, error) {
	var spans []roachpb.Span
	err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
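		// Reset spans in case the transaction's closure is retried.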
		spans = nil
		txn.SetFixedTimestamp(ctx, ts)
		// Note that all targets are currently guaranteed to be tables.
		for tableID := range targets {
			tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, codec, tableID)
			if err != nil {
				return err
			}
			spans = append(spans, tableDesc.PrimaryIndexSpan(codec))
		}
		return nil
	})
	return spans, err
}

// changefeedResultWriter implements the `rowexec.resultWriter` interface and
// sends the received rows back over the given channel.
type changefeedResultWriter struct {
	rowsCh       chan<- tree.Datums
	rowsAffected int
	err          error
}

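// makeChangefeedResultWriter returns a changefeedResultWriter that forwards
// rows over rowsCh.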
func makeChangefeedResultWriter(rowsCh chan<- tree.Datums) *changefeedResultWriter {
	return &changefeedResultWriter{rowsCh: rowsCh}
}

func (w *changefeedResultWriter) AddRow(ctx context.Context, row tree.Datums) error {
	// Copy the row because it's not guaranteed to exist after this function
	// returns.
	row = append(tree.Datums(nil), row...)

	select {
	case <-ctx.Done():
		return ctx.Err()
	case w.rowsCh <- row:
		return nil
	}
}
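
// IncrementRowsAffected is part of the `rowexec.resultWriter` interface.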
func (w *changefeedResultWriter) IncrementRowsAffected(n int) {
	w.rowsAffected += n
}
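
// SetError is part of the `rowexec.resultWriter` interface.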
func (w *changefeedResultWriter) SetError(err error) {
	w.err = err
}
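
// Err is part of the `rowexec.resultWriter` interface.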
func (w *changefeedResultWriter) Err() error {
	return w.err
}