github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed_stmt.go

// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"context"
	"encoding/hex"
	"fmt"
	"math/rand"
	"net/url"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/backupccl"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

func init() {
	sql.AddPlanHook(changefeedPlanHook)
	jobs.RegisterConstructor(
		jobspb.TypeChangefeed,
		func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
			return &changefeedResumer{job: job}
		},
	)
}

// changefeedPlanHook implements sql.PlanHookFn.
func changefeedPlanHook(
	ctx context.Context, stmt tree.Statement, p sql.PlanHookState,
) (sql.PlanHookRowFn, sqlbase.ResultColumns, []sql.PlanNode, bool, error) {
	changefeedStmt, ok := stmt.(*tree.CreateChangefeed)
	if !ok {
		return nil, nil, nil, false, nil
	}

	var sinkURIFn func() (string, error)
	var header sqlbase.ResultColumns
	unspecifiedSink := changefeedStmt.SinkURI == nil
	avoidBuffering := false
	if unspecifiedSink {
		// An unspecified sink triggers a fairly radical change in behavior.
		// Instead of setting up a system.job to emit to a sink in the
		// background and returning immediately with the job ID, the `CREATE
		// CHANGEFEED` blocks forever and returns all changes as rows directly
		// over pgwire. The types of these rows are `(topic STRING, key BYTES,
		// value BYTES)` and they correspond exactly to what would be emitted to
		// a sink.
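		//
		// For example (illustrative values, not output captured from a real
		// cluster), a sinkless `CREATE CHANGEFEED FOR foo;` with the default
		// wrapped JSON envelope might stream rows like:
		//
		//   table | key | value
		//   ------+-----+----------------------------------
		//   foo   | [1] | {"after": {"a": 1, "b": "hello"}}
		//
		// until the client cancels the query.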
		sinkURIFn = func() (string, error) { return ``, nil }
		header = sqlbase.ResultColumns{
			{Name: "table", Typ: types.String},
			{Name: "key", Typ: types.Bytes},
			{Name: "value", Typ: types.Bytes},
		}
		avoidBuffering = true
	} else {
		var err error
		sinkURIFn, err = p.TypeAsString(ctx, changefeedStmt.SinkURI, `CREATE CHANGEFEED`)
		if err != nil {
			return nil, nil, nil, false, err
		}
		header = sqlbase.ResultColumns{
			{Name: "job_id", Typ: types.Int},
		}
	}

	optsFn, err := p.TypeAsStringOpts(ctx, changefeedStmt.Options, changefeedbase.ChangefeedOptionExpectValues)
	if err != nil {
		return nil, nil, nil, false, err
	}

	fn := func(ctx context.Context, _ []sql.PlanNode, resultsCh chan<- tree.Datums) error {
		ctx, span := tracing.ChildSpan(ctx, stmt.StatementTag())
		defer tracing.FinishSpan(span)

		if err := p.RequireAdminRole(ctx, "CREATE CHANGEFEED"); err != nil {
			return err
		}

		sinkURI, err := sinkURIFn()
		if err != nil {
			return err
		}
		if !unspecifiedSink && sinkURI == `` {
			// Error if someone specifies an INTO with the empty string. We've
			// already sent the wrong result column headers.
			return errors.New(`omit the SINK clause for inline results`)
		}

		opts, err := optsFn()
		if err != nil {
			return err
		}

		jobDescription, err := changefeedJobDescription(p, changefeedStmt, sinkURI, opts)
		if err != nil {
			return err
		}

		statementTime := hlc.Timestamp{
			WallTime: p.ExtendedEvalContext().GetStmtTimestamp().UnixNano(),
		}
		var initialHighWater hlc.Timestamp
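		// If the user specified a cursor, e.g. `WITH cursor='1591044520247261000'`
		// (an illustrative HLC timestamp, not taken from this file), the
		// changefeed begins emitting from that timestamp rather than from the
		// statement time.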
		if cursor, ok := opts[changefeedbase.OptCursor]; ok {
			asOf := tree.AsOfClause{Expr: tree.NewStrVal(cursor)}
			var err error
			if initialHighWater, err = p.EvalAsOfTimestamp(ctx, asOf); err != nil {
				return err
			}
			statementTime = initialHighWater
		}

		// For now, disallow targeting a database or wildcard table selection.
		// Getting it right as tables enter and leave the set over time is
		// tricky.
		if len(changefeedStmt.Targets.Databases) > 0 {
			return errors.Errorf(`CHANGEFEED cannot target %s`,
				tree.AsString(&changefeedStmt.Targets))
		}
		for _, t := range changefeedStmt.Targets.Tables {
			p, err := t.NormalizeTablePattern()
			if err != nil {
				return err
			}
			if _, ok := p.(*tree.TableName); !ok {
				return errors.Errorf(`CHANGEFEED cannot target %s`, tree.AsString(t))
			}
		}

		// This grabs table descriptors once to get their ids.
		targetDescs, _, err := backupccl.ResolveTargetsToDescriptors(
			ctx, p, statementTime, changefeedStmt.Targets, tree.RequestedDescriptors)
		if err != nil {
			return err
		}
		targets := make(jobspb.ChangefeedTargets, len(targetDescs))
		for _, desc := range targetDescs {
			if tableDesc := desc.Table(hlc.Timestamp{}); tableDesc != nil {
				targets[tableDesc.ID] = jobspb.ChangefeedTarget{
					StatementTimeName: tableDesc.Name,
				}
				if err := validateChangefeedTable(targets, tableDesc); err != nil {
					return err
				}
			}
		}

		details := jobspb.ChangefeedDetails{
			Targets:       targets,
			Opts:          opts,
			SinkURI:       sinkURI,
			StatementTime: statementTime,
		}
		progress := jobspb.Progress{
			Progress: &jobspb.Progress_HighWater{},
			Details: &jobspb.Progress_Changefeed{
				Changefeed: &jobspb.ChangefeedProgress{},
			},
		}

		// TODO(dan): In an attempt to present the most helpful error message to the
		// user, the ordering requirements between all these usage validations have
		// become extremely fragile and non-obvious.
		//
		// - `validateDetails` has to run first to fill in defaults for `envelope`
		//   and `format` if the user didn't specify them.
		// - Then `getEncoder` is run to return any configuration errors.
		// - Then the changefeed is opted in to `OptKeyInValue` for any cloud
		//   storage sink. Kafka etc. have a key and value field in each message but
		//   cloud storage sinks don't have anywhere to put the key. So if the key
		//   is not in the value, then for DELETEs there is no way to recover which
		//   key was deleted. We could make the user explicitly pass this option for
		//   every cloud storage sink and error if they don't, but that seems
		//   user-hostile for insufficient reason. We can't do this any earlier,
		//   because we might return errors about `key_in_value` being incompatible,
		//   which is confusing when the user didn't type that option.
		// - Finally, we create a "canary" sink to test sink configuration and
		//   connectivity. This has to go last because it is strange to return sink
		//   connectivity errors before we've finished validating all the other
		//   options. We should probably split sink configuration checking and sink
		//   connectivity checking into separate methods.
		//
		// The only upside in all this nonsense is the tests are decent. I've tuned
		// this particular order simply by rearranging stuff until the changefeedccl
		// tests all pass.
		parsedSink, err := url.Parse(sinkURI)
		if err != nil {
			return err
		}
		if details, err = validateDetails(details); err != nil {
			return err
		}

		if _, err := getEncoder(details.Opts); err != nil {
			return err
		}
		if isCloudStorageSink(parsedSink) {
			details.Opts[changefeedbase.OptKeyInValue] = ``
		}

		// Feature telemetry
		telemetrySink := parsedSink.Scheme
		if telemetrySink == `` {
			telemetrySink = `sinkless`
		}
		telemetry.Count(`changefeed.create.sink.` + telemetrySink)
		telemetry.Count(`changefeed.create.format.` + details.Opts[changefeedbase.OptFormat])
		telemetry.CountBucketed(`changefeed.create.num_tables`, int64(len(targets)))

		if details.SinkURI == `` {
			err := distChangefeedFlow(ctx, p, 0 /* jobID */, details, progress, resultsCh)
			return MaybeStripRetryableErrorMarker(err)
		}

		settings := p.ExecCfg().Settings
		if err := utilccl.CheckEnterpriseEnabled(
			settings, p.ExecCfg().ClusterID(), p.ExecCfg().Organization(), "CHANGEFEED",
		); err != nil {
			return err
		}

		// While the user is executing a CREATE CHANGEFEED and still waiting for
		// the statement to return, we take the opportunity to ensure that they
		// have not made any obvious errors when specifying the sink in the
		// CREATE CHANGEFEED statement. To do this, we create a "canary" sink,
		// used only to check for errors and closed immediately afterwards.
		{
			nodeID, err := p.ExtendedEvalContext().NodeID.OptionalNodeIDErr(48274)
			if err != nil {
				return err
			}
			var nilOracle timestampLowerBoundOracle
			canarySink, err := getSink(
				ctx, details.SinkURI, nodeID, details.Opts, details.Targets,
				settings, nilOracle, p.ExecCfg().DistSQLSrv.ExternalStorageFromURI,
			)
			if err != nil {
				return MaybeStripRetryableErrorMarker(err)
			}
			if err := canarySink.Close(); err != nil {
				return err
			}
		}

		// Make a channel for runChangefeedFlow to signal once everything has
		// been set up okay. This intentionally abuses what would normally be
		// hooked up to resultsCh to avoid a bunch of extra plumbing.
		startedCh := make(chan tree.Datums)

		// The block below creates the job and, if there's an initial scan,
		// protects the data required for that scan. We protect the data here
		// rather than in Resume to shorten the window in which data may be
		// GC'd. Protected timestamps are created and removed during the
		// execution of the changefeed by the changeFrontier when checkpointing
		// progress; they are additionally removed in OnFailOrCancel. See the
		// comment on changeFrontier.manageProtectedTimestamps for more details
		// on the handling of protected timestamps.
		var sj *jobs.StartableJob
		{
			var protectedTimestampID uuid.UUID
			var spansToProtect []roachpb.Span
			if hasInitialScan := initialScanFromOptions(details.Opts); hasInitialScan {
				protectedTimestampID = uuid.MakeV4()
				spansToProtect = makeSpansToProtect(details.Targets)
				progress.GetChangefeed().ProtectedTimestampRecord = protectedTimestampID
			}

			jr := jobs.Record{
				Description: jobDescription,
				Username:    p.User(),
				DescriptorIDs: func() (sqlDescIDs []sqlbase.ID) {
					for _, desc := range targetDescs {
						sqlDescIDs = append(sqlDescIDs, desc.GetID())
					}
					return sqlDescIDs
				}(),
				Details:  details,
				Progress: *progress.GetChangefeed(),
			}
			createJobAndProtectedTS := func(ctx context.Context, txn *kv.Txn) (err error) {
				sj, err = p.ExecCfg().JobRegistry.CreateStartableJobWithTxn(ctx, jr, txn, startedCh)
				if err != nil {
					return err
				}
				if protectedTimestampID == uuid.Nil {
					return nil
				}
				ptr := jobsprotectedts.MakeRecord(protectedTimestampID, *sj.ID(),
					statementTime, spansToProtect)
				return p.ExecCfg().ProtectedTimestampProvider.Protect(ctx, txn, ptr)
			}
			if err := p.ExecCfg().DB.Txn(ctx, createJobAndProtectedTS); err != nil {
				if sj != nil {
					if err := sj.CleanupOnRollback(ctx); err != nil {
						log.Warningf(ctx, "failed to cleanup aborted job: %v", err)
					}
				}
				return err
			}
			// If we created a protected timestamp for an initial scan, verify it.
			// Doing this synchronously here rather than asynchronously later provides
			// a nice UX win in the case that the data isn't actually available.
			if protectedTimestampID != uuid.Nil {
				if err := p.ExecCfg().ProtectedTimestampProvider.Verify(ctx, protectedTimestampID); err != nil {
					if cancelErr := sj.Cancel(ctx); cancelErr != nil {
						if ctx.Err() == nil {
							log.Warningf(ctx, "failed to cancel job: %v", cancelErr)
						}
					}
					return err
				}
			}
		}

		// Start the job and wait for it to signal on startedCh.
		errCh, err := sj.Start(ctx)
		if err != nil {
			return err
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case err := <-errCh:
			return err
		case <-startedCh:
			// The feed was set up without error; return control to the user.
		}
		resultsCh <- tree.Datums{
			tree.NewDInt(tree.DInt(*sj.ID())),
		}
		return nil
	}
	return fn, header, nil, avoidBuffering, nil
}

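// changefeedJobDescription renders the normalized CREATE CHANGEFEED statement
// stored as the job's description, with the sink URI sanitized (e.g. the SASL
// password scrubbed) and options sorted by key so the output is deterministic.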
func changefeedJobDescription(
	p sql.PlanHookState, changefeed *tree.CreateChangefeed, sinkURI string, opts map[string]string,
) (string, error) {
	cleanedSinkURI, err := cloud.SanitizeExternalStorageURI(sinkURI, []string{changefeedbase.SinkParamSASLPassword})
	if err != nil {
		return "", err
	}
	c := &tree.CreateChangefeed{
		Targets: changefeed.Targets,
		SinkURI: tree.NewDString(cleanedSinkURI),
	}
	for k, v := range opts {
		opt := tree.KVOption{Key: tree.Name(k)}
		if len(v) > 0 {
			opt.Value = tree.NewDString(v)
		}
		c.Options = append(c.Options, opt)
	}
	sort.Slice(c.Options, func(i, j int) bool { return c.Options[i].Key < c.Options[j].Key })
	ann := p.ExtendedEvalContext().Annotations
	return tree.AsStringWithFQNames(c, ann), nil
}

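// validateDetails checks the user-supplied options and normalizes them in
// place, filling in defaults for anything unspecified: `envelope` defaults to
// `wrapped`, `format` to `json`, `schema_change_events` to `default`, and
// `schema_change_policy` to `backfill`.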
func validateDetails(details jobspb.ChangefeedDetails) (jobspb.ChangefeedDetails, error) {
	if details.Opts == nil {
		// The proto MarshalTo method omits the Opts field if the map is empty.
		// So, if no options were specified by the user, Opts will be nil when
		// the job gets restarted.
		details.Opts = map[string]string{}
	}
	{
		const opt = changefeedbase.OptResolvedTimestamps
		if o, ok := details.Opts[opt]; ok && o != `` {
			if d, err := time.ParseDuration(o); err != nil {
				return jobspb.ChangefeedDetails{}, err
			} else if d < 0 {
				return jobspb.ChangefeedDetails{}, errors.Errorf(
					`negative durations are not accepted: %s='%s'`, opt, o)
			}
		}
	}
	{
		const opt = changefeedbase.OptSchemaChangeEvents
		switch v := changefeedbase.SchemaChangeEventClass(details.Opts[opt]); v {
		case ``, changefeedbase.OptSchemaChangeEventClassDefault:
			details.Opts[opt] = string(changefeedbase.OptSchemaChangeEventClassDefault)
		case changefeedbase.OptSchemaChangeEventClassColumnChange:
			// No-op
		default:
			return jobspb.ChangefeedDetails{}, errors.Errorf(
				`unknown %s: %s`, opt, v)
		}
	}
	{
		const opt = changefeedbase.OptSchemaChangePolicy
		switch v := changefeedbase.SchemaChangePolicy(details.Opts[opt]); v {
		case ``, changefeedbase.OptSchemaChangePolicyBackfill:
			details.Opts[opt] = string(changefeedbase.OptSchemaChangePolicyBackfill)
		case changefeedbase.OptSchemaChangePolicyNoBackfill:
			// No-op
		case changefeedbase.OptSchemaChangePolicyStop:
			// No-op
		default:
			return jobspb.ChangefeedDetails{}, errors.Errorf(
				`unknown %s: %s`, opt, v)
		}
	}
	{
		_, withInitialScan := details.Opts[changefeedbase.OptInitialScan]
		_, noInitialScan := details.Opts[changefeedbase.OptNoInitialScan]
		if withInitialScan && noInitialScan {
			return jobspb.ChangefeedDetails{}, errors.Errorf(
				`cannot specify both %s and %s`, changefeedbase.OptInitialScan,
				changefeedbase.OptNoInitialScan)
		}
	}
	{
		const opt = changefeedbase.OptEnvelope
		switch v := changefeedbase.EnvelopeType(details.Opts[opt]); v {
		case changefeedbase.OptEnvelopeRow, changefeedbase.OptEnvelopeDeprecatedRow:
			details.Opts[opt] = string(changefeedbase.OptEnvelopeRow)
		case changefeedbase.OptEnvelopeKeyOnly:
			details.Opts[opt] = string(changefeedbase.OptEnvelopeKeyOnly)
		case ``, changefeedbase.OptEnvelopeWrapped:
			details.Opts[opt] = string(changefeedbase.OptEnvelopeWrapped)
		default:
			return jobspb.ChangefeedDetails{}, errors.Errorf(
				`unknown %s: %s`, opt, v)
		}
	}
	{
		const opt = changefeedbase.OptFormat
		switch v := changefeedbase.FormatType(details.Opts[opt]); v {
		case ``, changefeedbase.OptFormatJSON:
			details.Opts[opt] = string(changefeedbase.OptFormatJSON)
		case changefeedbase.OptFormatAvro:
			// No-op.
		default:
			return jobspb.ChangefeedDetails{}, errors.Errorf(
				`unknown %s: %s`, opt, v)
		}
	}
	return details, nil
}

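// validateChangefeedTable rejects tables that changefeeds cannot watch:
// system tables, views, virtual tables, sequences, tables with more than one
// column family, and tables that were dropped, truncated, or renamed after
// the changefeed's statement time.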
func validateChangefeedTable(
	targets jobspb.ChangefeedTargets, tableDesc *sqlbase.TableDescriptor,
) error {
	t, ok := targets[tableDesc.ID]
	if !ok {
		return errors.Errorf(`unwatched table: %s`, tableDesc.Name)
	}

	// Technically, the only non-user table known not to work is system.jobs
	// (which creates a cycle since the resolved timestamp high-water mark is
	// saved in it), but there are subtle differences in the way many of them
	// work and this will be under-tested, so disallow them all until demand
	// dictates.
	if tableDesc.ID < keys.MinUserDescID {
		return errors.Errorf(`CHANGEFEEDs are not supported on system tables`)
	}
	if tableDesc.IsView() {
		return errors.Errorf(`CHANGEFEED cannot target views: %s`, tableDesc.Name)
	}
	if tableDesc.IsVirtualTable() {
		return errors.Errorf(`CHANGEFEED cannot target virtual tables: %s`, tableDesc.Name)
	}
	if tableDesc.IsSequence() {
		return errors.Errorf(`CHANGEFEED cannot target sequences: %s`, tableDesc.Name)
	}
	if len(tableDesc.Families) != 1 {
		return errors.Errorf(
			`CHANGEFEEDs are currently supported on tables with exactly 1 column family: %s has %d`,
			tableDesc.Name, len(tableDesc.Families))
	}

	if tableDesc.State == sqlbase.TableDescriptor_DROP {
		return errors.Errorf(`"%s" was dropped or truncated`, t.StatementTimeName)
	}
	if tableDesc.Name != t.StatementTimeName {
		return errors.Errorf(`"%s" was renamed to "%s"`, t.StatementTimeName, tableDesc.Name)
	}

	// TODO(mrtracy): re-enable this when allow-backfill option is added.
	// if tableDesc.HasColumnBackfillMutation() {
	// 	return errors.Errorf(`CHANGEFEEDs cannot operate on tables being backfilled`)
	// }

	return nil
}

type changefeedResumer struct {
	job *jobs.Job
}

// generateChangefeedSessionID generates a unique string that is used to
// prevent overwriting of output files by the cloudStorageSink.
func generateChangefeedSessionID() string {
	// We read exactly 8 random bytes. 8 bytes should be enough because:
	// Consider that each new session for a changefeed job can occur at the
	// same highWater timestamp for its catch-up scan. This session ID is
	// used to ensure that a session emitting files with the same timestamp
	// as the session before doesn't clobber existing files. Suppose each of
	// these sessions runs for essentially zero seconds; our node liveness
	// duration is currently 9 seconds, but assume a conservative session
	// duration of 1 second, i.e. a new session starting every second. With
	// 8 bytes, using the rough square approximation for the birthday problem
	// https://en.wikipedia.org/wiki/Birthday_problem#Square_approximation, we
	// will have a 50% chance of a single collision after sqrt(2^64) = 2^32
	// sessions. So if we start a new job every second, we get a coin-flip
	// chance of a single collision after 136 years. With this same
	// approximation, we get something like 220 days to have a 0.001% chance
	// of a collision. In practice, jobs are likely to run for longer and it's
	// likely to take longer for job adoption, so we should be good with
	// 8 bytes. Similarly, it's clear that 16 would be way overkill. 4 bytes
	// gives us a 50% chance of collision after 65K sessions at the same
	// timestamp.
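	//
	// Spelling out that arithmetic as a sanity check: 2^32 sessions at one
	// per second is ~4.3e9 seconds ≈ 136 years, and for a 1e-5 collision
	// probability the square approximation gives sqrt(2 * 2^64 * 1e-5) ≈
	// 1.9e7 sessions ≈ 220 days of one-per-second sessions.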
	const size = 8
	p := make([]byte, size)
	buf := make([]byte, hex.EncodedLen(size))
	// math/rand.Read always returns len(p) and a nil error, so the return
	// values can be safely ignored.
	rand.Read(p)
	hex.Encode(buf, p)
	return string(buf)
}

// Resume is part of the jobs.Resumer interface.
func (b *changefeedResumer) Resume(
	ctx context.Context, planHookState interface{}, startedCh chan<- tree.Datums,
) error {
	phs := planHookState.(sql.PlanHookState)
	execCfg := phs.ExecCfg()
	jobID := *b.job.ID()
	details := b.job.Details().(jobspb.ChangefeedDetails)
	progress := b.job.Progress()

	// We'd like to avoid failing a changefeed unnecessarily, so when an error
	// bubbles up to this level, we'd like to "retry" the flow if possible. This
	// could be because the sink is down or because a cockroach node has crashed
	// or for many other reasons.
	opts := retry.Options{
		InitialBackoff: 5 * time.Millisecond,
		Multiplier:     2,
		MaxBackoff:     10 * time.Second,
	}
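	// With these settings the backoff starts at 5ms, doubles on each retry,
	// and is capped at 10s; since no MaxRetries is set, the loop below keeps
	// retrying until the error is non-retryable or the context is canceled.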
	var err error
	for r := retry.StartWithCtx(ctx, opts); r.Next(); {
		if err = distChangefeedFlow(ctx, phs, jobID, details, progress, startedCh); err == nil {
			return nil
		}
		if !IsRetryableError(err) {
			if ctx.Err() != nil {
				return ctx.Err()
			}

			if flowinfra.IsFlowRetryableError(err) {
				// We don't want to retry a flowinfra retryable error in the retry
				// loop above. This error currently indicates that this node is
				// being drained; as such, retries will not help.
				// Instead, we want to make sure that the changefeed job is not
				// marked failed due to a transient, retryable error.
				err = jobs.NewRetryJobError(fmt.Sprintf("retryable flow error: %+v", err))
			}

			log.Warningf(ctx, `CHANGEFEED job %d returning with error: %+v`, jobID, err)
			return err
		}

		log.Warningf(ctx, `CHANGEFEED job %d encountered retryable error: %v`, jobID, err)
		if metrics, ok := execCfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics); ok {
			metrics.ErrorRetries.Inc(1)
		}
		// Re-load the job in order to update our progress object, which may have
		// been updated by the changeFrontier processor since the flow started.
		reloadedJob, reloadErr := execCfg.JobRegistry.LoadJob(ctx, jobID)
		if reloadErr != nil {
			if ctx.Err() != nil {
				return ctx.Err()
			}
			log.Warningf(ctx, `CHANGEFEED job %d could not reload job progress; `+
				`continuing from last known high-water of %s: %v`,
				jobID, progress.GetHighWater(), reloadErr)
		} else {
			progress = reloadedJob.Progress()
		}

		// startedCh is normally used to signal back to the creator of the job that
		// the job has started; however, in this case nothing will ever receive
		// on the channel, causing the changefeed flow to block. Replace it with
		// a dummy channel.
		startedCh = make(chan tree.Datums, 1)
	}
	// We only hit this if `r.Next()` returns false, which right now only happens
	// on context cancellation.
	return errors.Wrap(err, `ran out of retries`)
}

// OnFailOrCancel is part of the jobs.Resumer interface.
func (b *changefeedResumer) OnFailOrCancel(ctx context.Context, planHookState interface{}) error {
	phs := planHookState.(sql.PlanHookState)
	execCfg := phs.ExecCfg()
	progress := b.job.Progress()
	b.maybeCleanUpProtectedTimestamp(ctx, execCfg.DB, execCfg.ProtectedTimestampProvider,
		progress.GetChangefeed().ProtectedTimestampRecord)
	return nil
}

// maybeCleanUpProtectedTimestamp tries to clean up a protected timestamp
// created by the changefeed, if one exists.
func (b *changefeedResumer) maybeCleanUpProtectedTimestamp(
	ctx context.Context, db *kv.DB, pts protectedts.Storage, ptsID uuid.UUID,
) {
	if ptsID == uuid.Nil {
		return
	}
	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return pts.Release(ctx, txn, ptsID)
	}); err != nil && !errors.Is(err, protectedts.ErrNotExists) {
		// NB: The record should get cleaned up by the reconciliation loop.
		// No good reason to cause more trouble by returning an error here.
		// Log and move on.
		log.Warningf(ctx, "failed to remove protected timestamp record %v: %v", ptsID, err)
	}
}

var _ jobs.PauseRequester = (*changefeedResumer)(nil)

// OnPauseRequest implements jobs.PauseRequester. If this changefeed is being
// paused, we want to install a protected timestamp at the most recent high
// watermark if there isn't already one.
func (b *changefeedResumer) OnPauseRequest(
	ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress,
) error {
	details := b.job.Details().(jobspb.ChangefeedDetails)
	if _, shouldPause := details.Opts[changefeedbase.OptProtectDataFromGCOnPause]; !shouldPause {
		return nil
	}

	cp := progress.GetChangefeed()

	// If we already have a protected timestamp record, keep it where it is.
	if cp.ProtectedTimestampRecord != uuid.Nil {
		return nil
	}

	resolved := progress.GetHighWater()
	if resolved == nil {
		// This should only happen if the job was created in a version that did not
		// use protected timestamps but has yet to checkpoint its high water.
		// Changefeeds from older versions didn't get protected timestamps, so it's
		// fine to not protect this one. In newer versions, changefeeds which perform
		// an initial scan at the statement time (and don't have an initial high
		// water) will have a protected timestamp.
		return nil
	}

	pts := planHookState.(sql.PlanHookState).ExecCfg().ProtectedTimestampProvider
	return createProtectedTimestampRecord(ctx, pts, txn, *b.job.ID(),
		details.Targets, *resolved, cp)
}