github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/schemafeed/schema_feed.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package schemafeed
    10  
    11  import (
    12  	"context"
    13  	"fmt"
    14  	"sort"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
    18  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    19  	"github.com/cockroachdb/cockroach/pkg/keys"
    20  	"github.com/cockroachdb/cockroach/pkg/kv"
    21  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    22  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    25  	"github.com/cockroachdb/cockroach/pkg/storage"
    26  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    27  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    28  	"github.com/cockroachdb/cockroach/pkg/util/log"
    29  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    31  	"github.com/cockroachdb/errors"
    32  )
    33  
    34  // TODO(ajwerner): Ideally we could have a centralized worker which reads the
    35  // table descriptors instead of polling from each changefeed. This wouldn't be
    36  // too hard. Each registered queue would have a start time. You'd scan from the
    37  // earliest and just ingest the relevant descriptors.
    38  
    39  // TableEvent represents a change to a table descriptor.
    40  type TableEvent struct {
    41  	Before, After *sqlbase.TableDescriptor
    42  }
    43  
    44  // Timestamp refers to the ModificationTime of the After table descriptor.
    45  func (e TableEvent) Timestamp() hlc.Timestamp {
    46  	return e.After.ModificationTime
    47  }
    48  
    49  // Config configures a SchemaFeed.
    50  type Config struct {
    51  	DB       *kv.DB
    52  	Clock    *hlc.Clock
    53  	Settings *cluster.Settings
    54  	Targets  jobspb.ChangefeedTargets
    55  
    56  	// SchemaChangeEvents controls the class of events which are emitted by this
    57  	// SchemaFeed.
    58  	SchemaChangeEvents changefeedbase.SchemaChangeEventClass
    59  
    60  	// InitialHighWater is the timestamp after which events should occur.
    61  	//
    62  	// NB: When clients want to create a changefeed which has a resolved timestamp
    63  	// of ts1, they care about write which occur at ts1.Next() and later but they
    64  	// should scan the tables as of ts1. This is important so that writes which
    65  	// change the table at ts1.Next() are emitted as an event.
    66  	InitialHighWater hlc.Timestamp
    67  
    68  	// LeaseManager is used to ensure that when an event is emitted that at a higher
    69  	// level it is ensured that the right table descriptor will be used for the
    70  	// event if this lease manager is used.
    71  	//
    72  	// TODO(ajwerner): Should this live underneath the FilterFunc?
    73  	// Should there be another function to decide whether to update the
    74  	// lease manager?
    75  	LeaseManager *lease.Manager
    76  }
    77  
    78  // SchemaFeed tracks changes to a set of tables and exports them as a queue of
    79  // events. The queue allows clients to provide a timestamp at or before which
    80  // all events must be seen by the time Peek or Pop returns. This allows clients
    81  // to ensure that all table events which precede some rangefeed event are seen
    82  // before propagating that rangefeed event.
    83  //
    84  // Internally, two timestamps are tracked. The high-water is the highest
    85  // timestamp such that every version of a TableDescriptor has met a provided
    86  // invariant (via `validateFn`). An error timestamp is also kept, which is the
    87  // lowest timestamp where at least one table doesn't meet the invariant.
    88  type SchemaFeed struct {
    89  	filter   tableEventFilter
    90  	db       *kv.DB
    91  	clock    *hlc.Clock
    92  	settings *cluster.Settings
    93  	targets  jobspb.ChangefeedTargets
    94  	leaseMgr *lease.Manager
    95  	mu       struct {
    96  		syncutil.Mutex
    97  
    98  		started bool
    99  
   100  		// the highest known valid timestamp
   101  		highWater hlc.Timestamp
   102  
   103  		// the lowest known invalid timestamp
   104  		errTS hlc.Timestamp
   105  
   106  		// the error associated with errTS
   107  		err error
   108  
   109  		// callers waiting on a timestamp to be resolved as valid or invalid
   110  		waiters []tableHistoryWaiter
   111  
   112  		// events is a sorted list of table events which have not been popped and
   113  		// are at or below highWater.
   114  		events []TableEvent
   115  
   116  		// previousTableVersion is a map from tableID to the most recent version
   117  		// of the table descriptor seen by the poller. This is needed to determine
   118  		// when a backilling mutation has successfully completed - this can only
   119  		// be determining by comparing a version to the previous version.
   120  		previousTableVersion map[sqlbase.ID]*sqlbase.TableDescriptor
   121  	}
   122  }
   123  
   124  type tableHistoryWaiter struct {
   125  	ts    hlc.Timestamp
   126  	errCh chan error
   127  }
   128  
   129  // New creates SchemaFeed with the given Config.
   130  func New(cfg Config) *SchemaFeed {
   131  	// TODO(ajwerner): validate config.
   132  	m := &SchemaFeed{
   133  		filter:   schemaChangeEventFilters[cfg.SchemaChangeEvents],
   134  		db:       cfg.DB,
   135  		clock:    cfg.Clock,
   136  		settings: cfg.Settings,
   137  		targets:  cfg.Targets,
   138  		leaseMgr: cfg.LeaseManager,
   139  	}
   140  	m.mu.previousTableVersion = make(map[sqlbase.ID]*sqlbase.TableDescriptor)
   141  	m.mu.highWater = cfg.InitialHighWater
   142  	return m
   143  }
   144  
   145  func (tf *SchemaFeed) markStarted() error {
   146  	tf.mu.Lock()
   147  	defer tf.mu.Unlock()
   148  	if tf.mu.started {
   149  		return errors.AssertionFailedf("SchemaFeed started more than once")
   150  	}
   151  	tf.mu.started = true
   152  	return nil
   153  }
   154  
   155  // Run will run the SchemaFeed. It is an error to run a feed more than once.
   156  func (tf *SchemaFeed) Run(ctx context.Context) error {
   157  	if err := tf.markStarted(); err != nil {
   158  		return err
   159  	}
   160  
   161  	// Fetch the table descs as of the initial highWater and prime the table
   162  	// history with them. This addresses #41694 where we'd skip the rest of a
   163  	// backfill if the changefeed was paused/unpaused during it. The bug was that
   164  	// the changefeed wouldn't notice the table descriptor had changed (and thus
   165  	// we were in the backfill state) when it restarted.
   166  	if err := tf.primeInitialTableDescs(ctx); err != nil {
   167  		return err
   168  	}
   169  	// We want to initialize the table history which will pull the initial version
   170  	// and then begin polling.
   171  	//
   172  	// TODO(ajwerner): As written the polling will add table events forever.
   173  	// If there are a ton of table events we'll buffer them all in RAM. There are
   174  	// cases where this might be problematic. It could be mitigated with some
   175  	// memory monitoring. Probably better is to not poll eagerly but only poll if
   176  	// we don't have an event.
   177  	//
   178  	// After we add some sort of locking to prevent schema changes we should also
   179  	// only poll if we don't have a lease.
   180  	return tf.pollTableHistory(ctx)
   181  }
   182  
   183  func (tf *SchemaFeed) primeInitialTableDescs(ctx context.Context) error {
   184  	tf.mu.Lock()
   185  	initialTableDescTs := tf.mu.highWater
   186  	tf.mu.Unlock()
   187  	var initialDescs []*sqlbase.TableDescriptor
   188  	initialTableDescsFn := func(ctx context.Context, txn *kv.Txn) error {
   189  		initialDescs = initialDescs[:0]
   190  		txn.SetFixedTimestamp(ctx, initialTableDescTs)
   191  		// Note that all targets are currently guaranteed to be tables.
   192  		for tableID := range tf.targets {
   193  			tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, keys.SystemSQLCodec, tableID)
   194  			if err != nil {
   195  				return err
   196  			}
   197  			initialDescs = append(initialDescs, tableDesc)
   198  		}
   199  		return nil
   200  	}
   201  	if err := tf.db.Txn(ctx, initialTableDescsFn); err != nil {
   202  		return err
   203  	}
   204  	return tf.ingestDescriptors(ctx, hlc.Timestamp{}, initialTableDescTs, initialDescs, tf.validateTable)
   205  }
   206  
   207  func (tf *SchemaFeed) pollTableHistory(ctx context.Context) error {
   208  	for {
   209  		if err := tf.updateTableHistory(ctx, tf.clock.Now()); err != nil {
   210  			return err
   211  		}
   212  
   213  		select {
   214  		case <-ctx.Done():
   215  			return nil
   216  		case <-time.After(changefeedbase.TableDescriptorPollInterval.Get(&tf.settings.SV)):
   217  		}
   218  	}
   219  }
   220  
   221  func (tf *SchemaFeed) updateTableHistory(ctx context.Context, endTS hlc.Timestamp) error {
   222  	startTS := tf.highWater()
   223  	if endTS.LessEq(startTS) {
   224  		return nil
   225  	}
   226  	descs, err := fetchTableDescriptorVersions(ctx, tf.db, startTS, endTS, tf.targets)
   227  	if err != nil {
   228  		return err
   229  	}
   230  	return tf.ingestDescriptors(ctx, startTS, endTS, descs, tf.validateTable)
   231  }
   232  
   233  // Peek returns all events which have not been popped which happen at or
   234  // before the passed timestamp.
   235  func (tf *SchemaFeed) Peek(
   236  	ctx context.Context, atOrBefore hlc.Timestamp,
   237  ) (events []TableEvent, err error) {
   238  
   239  	return tf.peekOrPop(ctx, atOrBefore, false /* pop */)
   240  }
   241  
   242  // Pop pops events from the EventQueue.
   243  func (tf *SchemaFeed) Pop(
   244  	ctx context.Context, atOrBefore hlc.Timestamp,
   245  ) (events []TableEvent, err error) {
   246  	return tf.peekOrPop(ctx, atOrBefore, true /* pop */)
   247  }
   248  
   249  func (tf *SchemaFeed) peekOrPop(
   250  	ctx context.Context, atOrBefore hlc.Timestamp, pop bool,
   251  ) (events []TableEvent, err error) {
   252  	if err = tf.waitForTS(ctx, atOrBefore); err != nil {
   253  		return nil, err
   254  	}
   255  	tf.mu.Lock()
   256  	defer tf.mu.Unlock()
   257  	i := sort.Search(len(tf.mu.events), func(i int) bool {
   258  		return !tf.mu.events[i].Timestamp().LessEq(atOrBefore)
   259  	})
   260  	if i == -1 {
   261  		i = 0
   262  	}
   263  	events = tf.mu.events[:i]
   264  	if pop {
   265  		tf.mu.events = tf.mu.events[i:]
   266  	}
   267  	return events, nil
   268  }
   269  
   270  // highWater returns the current high-water timestamp.
   271  func (tf *SchemaFeed) highWater() hlc.Timestamp {
   272  	tf.mu.Lock()
   273  	highWater := tf.mu.highWater
   274  	tf.mu.Unlock()
   275  	return highWater
   276  }
   277  
   278  // waitForTS blocks until the given timestamp is less than or equal to the
   279  // high-water or error timestamp. In the latter case, the error is returned.
   280  //
   281  // If called twice with the same timestamp, two different errors may be returned
   282  // (since the error timestamp can recede). However, the return for a given
   283  // timestamp will never switch from nil to an error or vice-versa (assuming that
   284  // `validateFn` is deterministic and the ingested descriptors are read
   285  // transactionally).
   286  func (tf *SchemaFeed) waitForTS(ctx context.Context, ts hlc.Timestamp) error {
   287  	var errCh chan error
   288  
   289  	tf.mu.Lock()
   290  	highWater := tf.mu.highWater
   291  	var err error
   292  	if tf.mu.errTS != (hlc.Timestamp{}) && tf.mu.errTS.LessEq(ts) {
   293  		err = tf.mu.err
   294  	}
   295  	fastPath := err != nil || ts.LessEq(highWater)
   296  	if !fastPath {
   297  		errCh = make(chan error, 1)
   298  		tf.mu.waiters = append(tf.mu.waiters, tableHistoryWaiter{ts: ts, errCh: errCh})
   299  	}
   300  	tf.mu.Unlock()
   301  	if fastPath {
   302  		if log.V(1) {
   303  			log.Infof(ctx, "fastpath for %s: %v", ts, err)
   304  		}
   305  		return err
   306  	}
   307  
   308  	if log.V(1) {
   309  		log.Infof(ctx, "waiting for %s highwater", ts)
   310  	}
   311  	start := timeutil.Now()
   312  	select {
   313  	case <-ctx.Done():
   314  		return ctx.Err()
   315  	case err := <-errCh:
   316  		if log.V(1) {
   317  			log.Infof(ctx, "waited %s for %s highwater: %v", timeutil.Since(start), ts, err)
   318  		}
   319  		return err
   320  	}
   321  }
   322  
   323  func descLess(a, b *sqlbase.TableDescriptor) bool {
   324  	if a.ModificationTime.Equal(b.ModificationTime) {
   325  		return a.ID < b.ID
   326  	}
   327  	return a.ModificationTime.Less(b.ModificationTime)
   328  }
   329  
   330  // ingestDescriptors checks the given descriptors against the invariant check
   331  // function and adjusts the high-water or error timestamp appropriately. It is
   332  // required that the descriptors represent a transactional kv read between the
   333  // two given timestamps.
   334  //
   335  // validateFn is exposed for testing, in production it is tf.validateTable.
   336  func (tf *SchemaFeed) ingestDescriptors(
   337  	ctx context.Context,
   338  	startTS, endTS hlc.Timestamp,
   339  	descs []*sqlbase.TableDescriptor,
   340  	validateFn func(ctx context.Context, desc *sqlbase.TableDescriptor) error,
   341  ) error {
   342  	sort.Slice(descs, func(i, j int) bool { return descLess(descs[i], descs[j]) })
   343  	var validateErr error
   344  	for _, desc := range descs {
   345  		if err := validateFn(ctx, desc); validateErr == nil {
   346  			validateErr = err
   347  		}
   348  	}
   349  	return tf.adjustTimestamps(startTS, endTS, validateErr)
   350  }
   351  
   352  // adjustTimestamps adjusts the high-water or error timestamp appropriately.
   353  func (tf *SchemaFeed) adjustTimestamps(startTS, endTS hlc.Timestamp, validateErr error) error {
   354  	tf.mu.Lock()
   355  	defer tf.mu.Unlock()
   356  
   357  	if validateErr != nil {
   358  		// don't care about startTS in the invalid case
   359  		if tf.mu.errTS == (hlc.Timestamp{}) || endTS.Less(tf.mu.errTS) {
   360  			tf.mu.errTS = endTS
   361  			tf.mu.err = validateErr
   362  			newWaiters := make([]tableHistoryWaiter, 0, len(tf.mu.waiters))
   363  			for _, w := range tf.mu.waiters {
   364  				if w.ts.Less(tf.mu.errTS) {
   365  					newWaiters = append(newWaiters, w)
   366  					continue
   367  				}
   368  				w.errCh <- validateErr
   369  			}
   370  			tf.mu.waiters = newWaiters
   371  		}
   372  		return validateErr
   373  	}
   374  
   375  	if tf.mu.highWater.Less(startTS) {
   376  		return errors.Errorf(`gap between %s and %s`, tf.mu.highWater, startTS)
   377  	}
   378  	if tf.mu.highWater.Less(endTS) {
   379  		tf.mu.highWater = endTS
   380  		newWaiters := make([]tableHistoryWaiter, 0, len(tf.mu.waiters))
   381  		for _, w := range tf.mu.waiters {
   382  			if tf.mu.highWater.Less(w.ts) {
   383  				newWaiters = append(newWaiters, w)
   384  				continue
   385  			}
   386  			w.errCh <- nil
   387  		}
   388  		tf.mu.waiters = newWaiters
   389  	}
   390  	return nil
   391  }
   392  func (e TableEvent) String() string {
   393  	return formatEvent(e)
   394  }
   395  
   396  func formatDesc(desc *sqlbase.TableDescriptor) string {
   397  	return fmt.Sprintf("%d:%d@%v", desc.ID, desc.Version, desc.ModificationTime)
   398  }
   399  
   400  func formatEvent(e TableEvent) string {
   401  	return fmt.Sprintf("%v->%v", formatDesc(e.Before), formatDesc(e.After))
   402  }
   403  
   404  func (tf *SchemaFeed) validateTable(ctx context.Context, desc *sqlbase.TableDescriptor) error {
   405  	if err := changefeedbase.ValidateTable(tf.targets, desc); err != nil {
   406  		return err
   407  	}
   408  	tf.mu.Lock()
   409  	defer tf.mu.Unlock()
   410  	log.Infof(ctx, "validate %v", formatDesc(desc))
   411  	if lastVersion, ok := tf.mu.previousTableVersion[desc.ID]; ok {
   412  		// NB: Writes can occur to a table
   413  		if desc.ModificationTime.LessEq(lastVersion.ModificationTime) {
   414  			return nil
   415  		}
   416  
   417  		// To avoid race conditions with the lease manager, at this point we force
   418  		// the manager to acquire the freshest descriptor of this table from the
   419  		// store. In normal operation, the lease manager returns the newest
   420  		// descriptor it knows about for the timestamp, assuming it's still
   421  		// allowed; without this explicit load, the lease manager might therefore
   422  		// return the previous version of the table, which is still technically
   423  		// allowed by the schema change system.
   424  		if err := tf.leaseMgr.AcquireFreshestFromStore(ctx, desc.ID); err != nil {
   425  			return err
   426  		}
   427  
   428  		e := TableEvent{
   429  			Before: lastVersion,
   430  			After:  desc,
   431  		}
   432  		shouldFilter, err := tf.filter.shouldFilter(ctx, e)
   433  		log.Infof(ctx, "validate shouldFilter %v %v", formatEvent(e), shouldFilter)
   434  		if err != nil {
   435  			return err
   436  		}
   437  		if !shouldFilter {
   438  			tf.mu.events = append(tf.mu.events, e)
   439  			sort.Slice(tf.mu.events, func(i, j int) bool {
   440  				return descLess(tf.mu.events[i].After, tf.mu.events[j].After)
   441  			})
   442  		}
   443  	}
   444  	tf.mu.previousTableVersion[desc.ID] = desc
   445  	return nil
   446  }
   447  
   448  func fetchTableDescriptorVersions(
   449  	ctx context.Context, db *kv.DB, startTS, endTS hlc.Timestamp, targets jobspb.ChangefeedTargets,
   450  ) ([]*sqlbase.TableDescriptor, error) {
   451  	if log.V(2) {
   452  		log.Infof(ctx, `fetching table descs (%s,%s]`, startTS, endTS)
   453  	}
   454  	start := timeutil.Now()
   455  	span := roachpb.Span{Key: keys.TODOSQLCodec.TablePrefix(keys.DescriptorTableID)}
   456  	span.EndKey = span.Key.PrefixEnd()
   457  	header := roachpb.Header{Timestamp: endTS}
   458  	req := &roachpb.ExportRequest{
   459  		RequestHeader: roachpb.RequestHeaderFromSpan(span),
   460  		StartTime:     startTS,
   461  		MVCCFilter:    roachpb.MVCCFilter_All,
   462  		ReturnSST:     true,
   463  		OmitChecksum:  true,
   464  	}
   465  	res, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), header, req)
   466  	if log.V(2) {
   467  		log.Infof(ctx, `fetched table descs (%s,%s] took %s`, startTS, endTS, timeutil.Since(start))
   468  	}
   469  	if pErr != nil {
   470  		err := pErr.GoError()
   471  		return nil, errors.Wrapf(err, `fetching changes for %s`, span)
   472  	}
   473  
   474  	var tableDescs []*sqlbase.TableDescriptor
   475  	for _, file := range res.(*roachpb.ExportResponse).Files {
   476  		if err := func() error {
   477  			it, err := storage.NewMemSSTIterator(file.SST, false /* verify */)
   478  			if err != nil {
   479  				return err
   480  			}
   481  			defer it.Close()
   482  			for it.SeekGE(storage.NilKey); ; it.Next() {
   483  				if ok, err := it.Valid(); err != nil {
   484  					return err
   485  				} else if !ok {
   486  					return nil
   487  				}
   488  				k := it.UnsafeKey()
   489  				remaining, _, _, err := keys.TODOSQLCodec.DecodeIndexPrefix(k.Key)
   490  				if err != nil {
   491  					return err
   492  				}
   493  				_, tableID, err := encoding.DecodeUvarintAscending(remaining)
   494  				if err != nil {
   495  					return err
   496  				}
   497  				origName, ok := targets[sqlbase.ID(tableID)]
   498  				if !ok {
   499  					// Uninteresting table.
   500  					continue
   501  				}
   502  				unsafeValue := it.UnsafeValue()
   503  				if unsafeValue == nil {
   504  					return errors.Errorf(`"%v" was dropped or truncated`, origName)
   505  				}
   506  				value := roachpb.Value{RawBytes: unsafeValue}
   507  				var desc sqlbase.Descriptor
   508  				if err := value.GetProto(&desc); err != nil {
   509  					return err
   510  				}
   511  				if tableDesc := desc.Table(k.Timestamp); tableDesc != nil {
   512  					tableDescs = append(tableDescs, tableDesc)
   513  				}
   514  			}
   515  		}(); err != nil {
   516  			return nil, err
   517  		}
   518  	}
   519  	return tableDescs, nil
   520  }