github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/schemafeed/schema_feed.go

// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package schemafeed

import (
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

// TODO(ajwerner): Ideally we could have a centralized worker which reads the
// table descriptors instead of polling from each changefeed. This wouldn't be
// too hard. Each registered queue would have a start time. You'd scan from the
// earliest and just ingest the relevant descriptors.

// TableEvent represents a change to a table descriptor.
type TableEvent struct {
	Before, After *sqlbase.TableDescriptor
}

// Timestamp refers to the ModificationTime of the After table descriptor.
func (e TableEvent) Timestamp() hlc.Timestamp {
	return e.After.ModificationTime
}

// Config configures a SchemaFeed.
type Config struct {
	DB       *kv.DB
	Clock    *hlc.Clock
	Settings *cluster.Settings
	Targets  jobspb.ChangefeedTargets

	// SchemaChangeEvents controls the class of events which are emitted by this
	// SchemaFeed.
	SchemaChangeEvents changefeedbase.SchemaChangeEventClass

	// InitialHighWater is the timestamp after which events should occur.
	//
	// NB: When clients want to create a changefeed which has a resolved timestamp
	// of ts1, they care about writes which occur at ts1.Next() and later, but
	// they should scan the tables as of ts1. This is important so that writes
	// which change the table at ts1.Next() are emitted as events.
	InitialHighWater hlc.Timestamp

	// LeaseManager is used to ensure that, when an event is emitted, higher-level
	// code using this lease manager will acquire the correct table descriptor
	// for the event.
	//
	// TODO(ajwerner): Should this live underneath the FilterFunc?
	// Should there be another function to decide whether to update the
	// lease manager?
	LeaseManager *lease.Manager
}

// SchemaFeed tracks changes to a set of tables and exports them as a queue of
// events. The queue allows clients to provide a timestamp at or before which
// all events must be seen by the time Peek or Pop returns. This allows clients
// to ensure that all table events which precede some rangefeed event are seen
// before propagating that rangefeed event.
//
// Internally, two timestamps are tracked. The high-water is the highest
// timestamp such that every version of a TableDescriptor has met a provided
// invariant (via `validateFn`). An error timestamp is also kept, which is the
// lowest timestamp where at least one table doesn't meet the invariant.
type SchemaFeed struct {
	filter   tableEventFilter
	db       *kv.DB
	clock    *hlc.Clock
	settings *cluster.Settings
	targets  jobspb.ChangefeedTargets
	leaseMgr *lease.Manager

	mu struct {
		syncutil.Mutex

		started bool

		// highWater is the highest known valid timestamp.
		highWater hlc.Timestamp

		// errTS is the lowest known invalid timestamp.
		errTS hlc.Timestamp

		// err is the error associated with errTS.
		err error

		// waiters are callers waiting on a timestamp to be resolved as valid or
		// invalid.
		waiters []tableHistoryWaiter

		// events is a sorted list of table events which have not been popped and
		// are at or below highWater.
		events []TableEvent

		// previousTableVersion is a map from tableID to the most recent version
		// of the table descriptor seen by the poller. This is needed to determine
		// when a backfilling mutation has successfully completed; this can only
		// be determined by comparing a version to the previous version.
		previousTableVersion map[sqlbase.ID]*sqlbase.TableDescriptor
	}
}
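
// The following is a hypothetical usage sketch, not part of the original
// file: it shows how a client might drive a SchemaFeed, assuming some source
// of resolved timestamps (the resolvedTimestamps channel is an illustrative
// stand-in). Run is started in its own goroutine, and the client Pops all
// table events at or below each resolved timestamp before acting on it,
// which is the ordering guarantee described in the SchemaFeed comment above.
func exampleDrainEvents(
	ctx context.Context, sf *SchemaFeed, resolvedTimestamps <-chan hlc.Timestamp,
) error {
	// Run the polling loop until it fails or the context is canceled.
	runErrCh := make(chan error, 1)
	go func() { runErrCh <- sf.Run(ctx) }()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case err := <-runErrCh:
			return err
		case ts := <-resolvedTimestamps:
			// Pop blocks until every table event at or below ts is known.
			events, err := sf.Pop(ctx, ts)
			if err != nil {
				return err
			}
			for _, e := range events {
				log.Infof(ctx, "table event preceding %s: %s", ts, e)
			}
		}
	}
}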

type tableHistoryWaiter struct {
	ts    hlc.Timestamp
	errCh chan error
}

// New creates a SchemaFeed with the given Config.
func New(cfg Config) *SchemaFeed {
	// TODO(ajwerner): validate config.
	m := &SchemaFeed{
		filter:   schemaChangeEventFilters[cfg.SchemaChangeEvents],
		db:       cfg.DB,
		clock:    cfg.Clock,
		settings: cfg.Settings,
		targets:  cfg.Targets,
		leaseMgr: cfg.LeaseManager,
	}
	m.mu.previousTableVersion = make(map[sqlbase.ID]*sqlbase.TableDescriptor)
	m.mu.highWater = cfg.InitialHighWater
	return m
}
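
// The following is a hypothetical sketch, not part of the original file: it
// shows how the pieces of a Config fit together when constructing a feed. All
// parameters are assumed to be supplied by the caller, and the "default"
// schema-change event class is used purely for illustration.
func exampleNewSchemaFeed(
	db *kv.DB,
	clock *hlc.Clock,
	st *cluster.Settings,
	targets jobspb.ChangefeedTargets,
	lm *lease.Manager,
	highWater hlc.Timestamp,
) *SchemaFeed {
	return New(Config{
		DB:                 db,
		Clock:              clock,
		Settings:           st,
		Targets:            targets,
		SchemaChangeEvents: changefeedbase.OptSchemaChangeEventClassDefault,
		// Per the Config comment: events are emitted for writes at
		// highWater.Next() and later; tables are scanned as of highWater.
		InitialHighWater: highWater,
		LeaseManager:     lm,
	})
}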

func (tf *SchemaFeed) markStarted() error {
	tf.mu.Lock()
	defer tf.mu.Unlock()
	if tf.mu.started {
		return errors.AssertionFailedf("SchemaFeed started more than once")
	}
	tf.mu.started = true
	return nil
}

// Run will run the SchemaFeed. It is an error to run a feed more than once.
func (tf *SchemaFeed) Run(ctx context.Context) error {
	if err := tf.markStarted(); err != nil {
		return err
	}

	// Fetch the table descs as of the initial highWater and prime the table
	// history with them. This addresses #41694 where we'd skip the rest of a
	// backfill if the changefeed was paused/unpaused during it. The bug was that
	// the changefeed wouldn't notice the table descriptor had changed (and thus
	// that we were in the backfill state) when it restarted.
	if err := tf.primeInitialTableDescs(ctx); err != nil {
		return err
	}
	// We want to initialize the table history, which will pull the initial
	// version, and then begin polling.
	//
	// TODO(ajwerner): As written, the polling will add table events forever.
	// If there are a ton of table events we'll buffer them all in RAM. There are
	// cases where this might be problematic. It could be mitigated with some
	// memory monitoring. Probably better is to not poll eagerly but only poll if
	// we don't have an event.
	//
	// After we add some sort of locking to prevent schema changes we should also
	// only poll if we don't have a lease.
	return tf.pollTableHistory(ctx)
}

func (tf *SchemaFeed) primeInitialTableDescs(ctx context.Context) error {
	tf.mu.Lock()
	initialTableDescTs := tf.mu.highWater
	tf.mu.Unlock()
	var initialDescs []*sqlbase.TableDescriptor
	initialTableDescsFn := func(ctx context.Context, txn *kv.Txn) error {
		initialDescs = initialDescs[:0]
		txn.SetFixedTimestamp(ctx, initialTableDescTs)
		// Note that all targets are currently guaranteed to be tables.
		for tableID := range tf.targets {
			tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, keys.SystemSQLCodec, tableID)
			if err != nil {
				return err
			}
			initialDescs = append(initialDescs, tableDesc)
		}
		return nil
	}
	if err := tf.db.Txn(ctx, initialTableDescsFn); err != nil {
		return err
	}
	return tf.ingestDescriptors(ctx, hlc.Timestamp{}, initialTableDescTs, initialDescs, tf.validateTable)
}

func (tf *SchemaFeed) pollTableHistory(ctx context.Context) error {
	for {
		if err := tf.updateTableHistory(ctx, tf.clock.Now()); err != nil {
			return err
		}

		select {
		case <-ctx.Done():
			return nil
		case <-time.After(changefeedbase.TableDescriptorPollInterval.Get(&tf.settings.SV)):
		}
	}
}

func (tf *SchemaFeed) updateTableHistory(ctx context.Context, endTS hlc.Timestamp) error {
	startTS := tf.highWater()
	if endTS.LessEq(startTS) {
		return nil
	}
	descs, err := fetchTableDescriptorVersions(ctx, tf.db, startTS, endTS, tf.targets)
	if err != nil {
		return err
	}
	return tf.ingestDescriptors(ctx, startTS, endTS, descs, tf.validateTable)
}

// Peek returns all events which have not been popped and which happen at or
// before the passed timestamp.
func (tf *SchemaFeed) Peek(
	ctx context.Context, atOrBefore hlc.Timestamp,
) (events []TableEvent, err error) {
	return tf.peekOrPop(ctx, atOrBefore, false /* pop */)
}

// Pop returns all events which happen at or before the passed timestamp and
// removes them from the queue.
func (tf *SchemaFeed) Pop(
	ctx context.Context, atOrBefore hlc.Timestamp,
) (events []TableEvent, err error) {
	return tf.peekOrPop(ctx, atOrBefore, true /* pop */)
}

func (tf *SchemaFeed) peekOrPop(
	ctx context.Context, atOrBefore hlc.Timestamp, pop bool,
) (events []TableEvent, err error) {
	if err = tf.waitForTS(ctx, atOrBefore); err != nil {
		return nil, err
	}
	tf.mu.Lock()
	defer tf.mu.Unlock()
	// Find the first event which happens after atOrBefore; everything before
	// that index is returned (and, for Pop, removed from the queue).
	i := sort.Search(len(tf.mu.events), func(i int) bool {
		return !tf.mu.events[i].Timestamp().LessEq(atOrBefore)
	})
	events = tf.mu.events[:i]
	if pop {
		tf.mu.events = tf.mu.events[i:]
	}
	return events, nil
}
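
// The following is a hypothetical sketch, not part of the original file,
// contrasting Peek and Pop: both block until every event at or before ts is
// known, but Peek leaves the events in the queue, so a subsequent Pop at the
// same timestamp returns that same prefix and then removes it.
func examplePeekThenPop(ctx context.Context, sf *SchemaFeed, ts hlc.Timestamp) error {
	peeked, err := sf.Peek(ctx, ts)
	if err != nil {
		return err
	}
	popped, err := sf.Pop(ctx, ts)
	if err != nil {
		return err
	}
	// Once ts has been resolved, later ingestion only adds events above the
	// high-water, so both calls observe the same prefix of the queue.
	log.Infof(ctx, "peeked %d events; popped %d events", len(peeked), len(popped))
	return nil
}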

// highWater returns the current high-water timestamp.
func (tf *SchemaFeed) highWater() hlc.Timestamp {
	tf.mu.Lock()
	highWater := tf.mu.highWater
	tf.mu.Unlock()
	return highWater
}

// waitForTS blocks until the given timestamp is less than or equal to the
// high-water or error timestamp. In the latter case, the error is returned.
//
// If called twice with the same timestamp, two different errors may be returned
// (since the error timestamp can recede). However, the return for a given
// timestamp will never switch from nil to an error or vice-versa (assuming that
// `validateFn` is deterministic and the ingested descriptors are read
// transactionally).
func (tf *SchemaFeed) waitForTS(ctx context.Context, ts hlc.Timestamp) error {
	var errCh chan error

	tf.mu.Lock()
	highWater := tf.mu.highWater
	var err error
	if tf.mu.errTS != (hlc.Timestamp{}) && tf.mu.errTS.LessEq(ts) {
		err = tf.mu.err
	}
	fastPath := err != nil || ts.LessEq(highWater)
	if !fastPath {
		errCh = make(chan error, 1)
		tf.mu.waiters = append(tf.mu.waiters, tableHistoryWaiter{ts: ts, errCh: errCh})
	}
	tf.mu.Unlock()
	if fastPath {
		if log.V(1) {
			log.Infof(ctx, "fastpath for %s: %v", ts, err)
		}
		return err
	}

	if log.V(1) {
		log.Infof(ctx, "waiting for %s highwater", ts)
	}
	start := timeutil.Now()
	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errCh:
		if log.V(1) {
			log.Infof(ctx, "waited %s for %s highwater: %v", timeutil.Since(start), ts, err)
		}
		return err
	}
}
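
// The following is a hypothetical sketch, not part of the original file: it
// shows waitForTS acting as the synchronization point for many goroutines at
// once. Each waiter is released independently, as soon as the high-water
// reaches its timestamp or an error at or below it is recorded.
func exampleConcurrentWaiters(
	ctx context.Context, sf *SchemaFeed, timestamps []hlc.Timestamp,
) error {
	errCh := make(chan error, len(timestamps))
	for _, ts := range timestamps {
		go func(ts hlc.Timestamp) {
			errCh <- sf.waitForTS(ctx, ts)
		}(ts)
	}
	// Collect one result per waiter, keeping the first error.
	var firstErr error
	for range timestamps {
		if err := <-errCh; err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}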

func descLess(a, b *sqlbase.TableDescriptor) bool {
	if a.ModificationTime.Equal(b.ModificationTime) {
		return a.ID < b.ID
	}
	return a.ModificationTime.Less(b.ModificationTime)
}

// ingestDescriptors checks the given descriptors against the invariant check
// function and adjusts the high-water or error timestamp appropriately. It is
// required that the descriptors represent a transactional kv read between the
// two given timestamps.
//
// validateFn is exposed for testing; in production it is tf.validateTable.
func (tf *SchemaFeed) ingestDescriptors(
	ctx context.Context,
	startTS, endTS hlc.Timestamp,
	descs []*sqlbase.TableDescriptor,
	validateFn func(ctx context.Context, desc *sqlbase.TableDescriptor) error,
) error {
	sort.Slice(descs, func(i, j int) bool { return descLess(descs[i], descs[j]) })
	// Validate every descriptor, but keep only the first error.
	var validateErr error
	for _, desc := range descs {
		if err := validateFn(ctx, desc); err != nil && validateErr == nil {
			validateErr = err
		}
	}
	return tf.adjustTimestamps(startTS, endTS, validateErr)
}

// adjustTimestamps adjusts the high-water or error timestamp appropriately.
func (tf *SchemaFeed) adjustTimestamps(startTS, endTS hlc.Timestamp, validateErr error) error {
	tf.mu.Lock()
	defer tf.mu.Unlock()

	if validateErr != nil {
		// We don't care about startTS in the invalid case.
		if tf.mu.errTS == (hlc.Timestamp{}) || endTS.Less(tf.mu.errTS) {
			tf.mu.errTS = endTS
			tf.mu.err = validateErr
			newWaiters := make([]tableHistoryWaiter, 0, len(tf.mu.waiters))
			for _, w := range tf.mu.waiters {
				if w.ts.Less(tf.mu.errTS) {
					newWaiters = append(newWaiters, w)
					continue
				}
				w.errCh <- validateErr
			}
			tf.mu.waiters = newWaiters
		}
		return validateErr
	}

	if tf.mu.highWater.Less(startTS) {
		return errors.Errorf(`gap between %s and %s`, tf.mu.highWater, startTS)
	}
	if tf.mu.highWater.Less(endTS) {
		tf.mu.highWater = endTS
		newWaiters := make([]tableHistoryWaiter, 0, len(tf.mu.waiters))
		for _, w := range tf.mu.waiters {
			if tf.mu.highWater.Less(w.ts) {
				newWaiters = append(newWaiters, w)
				continue
			}
			w.errCh <- nil
		}
		tf.mu.waiters = newWaiters
	}
	return nil
}
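
// The following is a hypothetical sketch, not part of the original file,
// illustrating the contract enforced by adjustTimestamps: ingested ranges
// must abut the current high-water. Assuming the feed's high-water starts at
// t0, ingesting (t0,t1] and then (t1,t2] advances it to t2, whereas jumping
// straight from t0 to some later (t3,t4] would return the "gap" error.
func exampleContiguousIngest(
	ctx context.Context, sf *SchemaFeed, t0, t1, t2 hlc.Timestamp,
) error {
	// Empty descriptor slices and no validation error: each call simply
	// advances the high-water to the end of its range.
	if err := sf.ingestDescriptors(ctx, t0, t1, nil, sf.validateTable); err != nil {
		return err
	}
	return sf.ingestDescriptors(ctx, t1, t2, nil, sf.validateTable)
}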

func (e TableEvent) String() string {
	return formatEvent(e)
}

func formatDesc(desc *sqlbase.TableDescriptor) string {
	return fmt.Sprintf("%d:%d@%v", desc.ID, desc.Version, desc.ModificationTime)
}

func formatEvent(e TableEvent) string {
	return fmt.Sprintf("%v->%v", formatDesc(e.Before), formatDesc(e.After))
}

func (tf *SchemaFeed) validateTable(ctx context.Context, desc *sqlbase.TableDescriptor) error {
	if err := changefeedbase.ValidateTable(tf.targets, desc); err != nil {
		return err
	}
	tf.mu.Lock()
	defer tf.mu.Unlock()
	log.Infof(ctx, "validate %v", formatDesc(desc))
	if lastVersion, ok := tf.mu.previousTableVersion[desc.ID]; ok {
		// NB: Writes can occur to a table descriptor without producing a
		// modification time newer than one we've already seen, so skip any
		// version which isn't strictly newer than the last one.
		if desc.ModificationTime.LessEq(lastVersion.ModificationTime) {
			return nil
		}

		// To avoid race conditions with the lease manager, at this point we force
		// the manager to acquire the freshest descriptor of this table from the
		// store. In normal operation, the lease manager returns the newest
		// descriptor it knows about for the timestamp, assuming it's still
		// allowed; without this explicit load, the lease manager might therefore
		// return the previous version of the table, which is still technically
		// allowed by the schema change system.
		if err := tf.leaseMgr.AcquireFreshestFromStore(ctx, desc.ID); err != nil {
			return err
		}

		e := TableEvent{
			Before: lastVersion,
			After:  desc,
		}
		shouldFilter, err := tf.filter.shouldFilter(ctx, e)
		log.Infof(ctx, "validate shouldFilter %v %v", formatEvent(e), shouldFilter)
		if err != nil {
			return err
		}
		if !shouldFilter {
			tf.mu.events = append(tf.mu.events, e)
			sort.Slice(tf.mu.events, func(i, j int) bool {
				return descLess(tf.mu.events[i].After, tf.mu.events[j].After)
			})
		}
	}
	tf.mu.previousTableVersion[desc.ID] = desc
	return nil
}

func fetchTableDescriptorVersions(
	ctx context.Context, db *kv.DB, startTS, endTS hlc.Timestamp, targets jobspb.ChangefeedTargets,
) ([]*sqlbase.TableDescriptor, error) {
	if log.V(2) {
		log.Infof(ctx, `fetching table descs (%s,%s]`, startTS, endTS)
	}
	start := timeutil.Now()
	span := roachpb.Span{Key: keys.TODOSQLCodec.TablePrefix(keys.DescriptorTableID)}
	span.EndKey = span.Key.PrefixEnd()
	header := roachpb.Header{Timestamp: endTS}
	req := &roachpb.ExportRequest{
		RequestHeader: roachpb.RequestHeaderFromSpan(span),
		StartTime:     startTS,
		MVCCFilter:    roachpb.MVCCFilter_All,
		ReturnSST:     true,
		OmitChecksum:  true,
	}
	res, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), header, req)
	if log.V(2) {
		log.Infof(ctx, `fetched table descs (%s,%s] took %s`, startTS, endTS, timeutil.Since(start))
	}
	if pErr != nil {
		err := pErr.GoError()
		return nil, errors.Wrapf(err, `fetching changes for %s`, span)
	}

	var tableDescs []*sqlbase.TableDescriptor
	for _, file := range res.(*roachpb.ExportResponse).Files {
		if err := func() error {
			it, err := storage.NewMemSSTIterator(file.SST, false /* verify */)
			if err != nil {
				return err
			}
			defer it.Close()
			for it.SeekGE(storage.NilKey); ; it.Next() {
				if ok, err := it.Valid(); err != nil {
					return err
				} else if !ok {
					return nil
				}
				k := it.UnsafeKey()
				remaining, _, _, err := keys.TODOSQLCodec.DecodeIndexPrefix(k.Key)
				if err != nil {
					return err
				}
				_, tableID, err := encoding.DecodeUvarintAscending(remaining)
				if err != nil {
					return err
				}
				origName, ok := targets[sqlbase.ID(tableID)]
				if !ok {
					// Uninteresting table.
					continue
				}
				unsafeValue := it.UnsafeValue()
				if unsafeValue == nil {
					return errors.Errorf(`"%v" was dropped or truncated`, origName)
				}
				value := roachpb.Value{RawBytes: unsafeValue}
				var desc sqlbase.Descriptor
				if err := value.GetProto(&desc); err != nil {
					return err
				}
				if tableDesc := desc.Table(k.Timestamp); tableDesc != nil {
					tableDescs = append(tableDescs, tableDesc)
				}
			}
		}(); err != nil {
			return nil, err
		}
	}
	return tableDescs, nil
}
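
// The following is a hypothetical sketch, not part of the original file, of a
// direct call to fetchTableDescriptorVersions: the ExportRequest above reads
// every MVCC revision of the descriptor table in (startTS, endTS], and the
// returned slice contains only versions of the targeted tables.
func exampleFetchDescVersions(
	ctx context.Context,
	db *kv.DB,
	targets jobspb.ChangefeedTargets,
	startTS, endTS hlc.Timestamp,
) error {
	descs, err := fetchTableDescriptorVersions(ctx, db, startTS, endTS, targets)
	if err != nil {
		return err
	}
	for _, desc := range descs {
		log.Infof(ctx, "descriptor version %s in (%s,%s]", formatDesc(desc), startTS, endTS)
	}
	return nil
}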