github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed_processors.go

// Copyright 2018 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"context"
	"fmt"
	"math"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/span"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

type changeAggregator struct {
	execinfra.ProcessorBase

	flowCtx *execinfra.FlowCtx
	spec    execinfrapb.ChangeAggregatorSpec
	memAcc  mon.BoundAccount

	// cancel shuts down the processor, both the `Next()` flow and the kvfeed.
	cancel func()
	// errCh contains the return values of the kvfeed.
	errCh chan error
	// kvFeedDoneCh is closed when the kvfeed exits.
	kvFeedDoneCh chan struct{}
	kvFeedMemMon *mon.BytesMonitor

	// encoder is the Encoder to use for key and value serialization.
	encoder Encoder
	// sink is the Sink to write rows to. Resolved timestamps are never written
	// by changeAggregator.
	sink Sink
	// tickFn is the workhorse behind Next(). It pulls kv changes from the
	// buffer that the poller fills, handles table leasing, converts them to
	// rows, and writes them to the sink.
	tickFn func(context.Context) ([]jobspb.ResolvedSpan, error)
	// changedRowBuf, if non-nil, contains changed rows to be emitted. Anything
	// queued in `resolvedSpanBuf` is dependent on these having been emitted, so
	// this one must be empty before moving on to that one.
	changedRowBuf *encDatumRowBuffer
	// resolvedSpanBuf contains resolved span updates to send to changeFrontier.
	// If sink is a bufferSink, it must be emptied before these are sent.
	resolvedSpanBuf encDatumRowBuffer
}

type timestampLowerBoundOracle interface {
	inclusiveLowerBoundTS() hlc.Timestamp
}

type changeAggregatorLowerBoundOracle struct {
	sf                         *span.Frontier
	initialInclusiveLowerBound hlc.Timestamp
}

// inclusiveLowerBoundTS is used to generate a representative timestamp to name
// cloudStorageSink files. This timestamp is either the statement time (in case this
// changefeed job hasn't yet seen any resolved timestamps) or the successor timestamp to
// the local span frontier. This convention is chosen to preserve CDC's ordering
// guarantees. See comment on cloudStorageSink for more details.
func (o *changeAggregatorLowerBoundOracle) inclusiveLowerBoundTS() hlc.Timestamp {
	if frontier := o.sf.Frontier(); !frontier.IsEmpty() {
		// We call `Next()` here on the frontier because this allows us
		// to name files using a timestamp that is an inclusive lower bound
		// on the timestamps of the updates contained within the file.
		// Files being created at the point this method is called are guaranteed
		// to contain row updates with timestamps strictly greater than the local
		// span frontier timestamp.
		return frontier.Next()
	}
	// This should only be returned in the case where the changefeed job hasn't yet
	// seen a resolved timestamp.
	return o.initialInclusiveLowerBound
}

var _ execinfra.Processor = &changeAggregator{}
var _ execinfra.RowSource = &changeAggregator{}

func newChangeAggregatorProcessor(
	flowCtx *execinfra.FlowCtx,
	processorID int32,
	spec execinfrapb.ChangeAggregatorSpec,
	output execinfra.RowReceiver,
) (execinfra.Processor, error) {
	ctx := flowCtx.EvalCtx.Ctx()
	memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "changeagg-mem")
	ca := &changeAggregator{
		flowCtx: flowCtx,
		spec:    spec,
		memAcc:  memMonitor.MakeBoundAccount(),
	}
	if err := ca.Init(
		ca,
		&execinfrapb.PostProcessSpec{},
		nil, /* types */
		flowCtx,
		processorID,
		output,
		memMonitor,
		execinfra.ProcStateOpts{
			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
				ca.close()
				return nil
			},
		},
	); err != nil {
		return nil, err
	}

	var err error
	if ca.encoder, err = getEncoder(ca.spec.Feed.Opts); err != nil {
		return nil, err
	}

	return ca, nil
}

func (ca *changeAggregator) OutputTypes() []*types.T {
	return changefeedResultTypes
}

// Start is part of the RowSource interface.
func (ca *changeAggregator) Start(ctx context.Context) context.Context {
	ctx, ca.cancel = context.WithCancel(ctx)
	// StartInternal is called at the beginning of the function because there
	// are early returns if errors are detected.
	ctx = ca.StartInternal(ctx, changeAggregatorProcName)

	spans, sf := ca.setupSpans()
	timestampOracle := &changeAggregatorLowerBoundOracle{sf: sf, initialInclusiveLowerBound: ca.spec.Feed.StatementTime}
	nodeID, err := ca.flowCtx.EvalCtx.NodeID.OptionalNodeIDErr(48274)
	if err != nil {
		ca.MoveToDraining(err)
		return ctx
	}

	if ca.sink, err = getSink(
		ctx, ca.spec.Feed.SinkURI, nodeID, ca.spec.Feed.Opts, ca.spec.Feed.Targets,
		ca.flowCtx.Cfg.Settings, timestampOracle, ca.flowCtx.Cfg.ExternalStorageFromURI,
	); err != nil {
		err = MarkRetryableError(err)
		// Early abort in the case that there is an error creating the sink.
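		// The error was wrapped with MarkRetryableError above so that a
		// failure to create the sink is treated as retryable by the changefeed
		// job rather than as a permanent error.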
		ca.MoveToDraining(err)
		ca.cancel()
		return ctx
	}

	// This is the correct point to set up certain hooks depending on the sink
	// type.
	if b, ok := ca.sink.(*bufferSink); ok {
		ca.changedRowBuf = &b.buf
	}

	// The job registry has a set of metrics used to monitor the various jobs it
	// runs. They're all stored as the `metric.Struct` interface because of
	// dependency cycles.
	metrics := ca.flowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
	ca.sink = makeMetricsSink(metrics, ca.sink)
	ca.sink = &errorWrapperSink{wrapped: ca.sink}

	var knobs TestingKnobs
	if cfKnobs, ok := ca.flowCtx.TestingKnobs().Changefeed.(*TestingKnobs); ok {
		knobs = *cfKnobs
	}

	// It seems like we should also be able to use `ca.ProcessorBase.MemMonitor`
	// for the poller, but there is a race between the flow's MemoryMonitor
	// getting Stopped and `changeAggregator.Close`, which causes panics. Not sure
	// what to do about this yet.
	kvFeedMemMonCapacity := kvfeed.MemBufferDefaultCapacity
	if knobs.MemBufferCapacity != 0 {
		kvFeedMemMonCapacity = knobs.MemBufferCapacity
	}
	kvFeedMemMon := mon.MakeMonitorInheritWithLimit("kvFeed", math.MaxInt64, ca.ProcessorBase.MemMonitor)
	kvFeedMemMon.Start(ctx, nil /* pool */, mon.MakeStandaloneBudget(kvFeedMemMonCapacity))
	ca.kvFeedMemMon = &kvFeedMemMon

	buf := kvfeed.MakeChanBuffer()
	leaseMgr := ca.flowCtx.Cfg.LeaseManager.(*lease.Manager)
	_, withDiff := ca.spec.Feed.Opts[changefeedbase.OptDiff]
	kvfeedCfg := makeKVFeedCfg(ca.flowCtx.Cfg, leaseMgr, ca.kvFeedMemMon, ca.spec,
		spans, withDiff, buf, metrics)
	rowsFn := kvsToRows(ca.flowCtx.Codec(), leaseMgr, ca.spec.Feed, buf.Get)
	ca.tickFn = emitEntries(ca.flowCtx.Cfg.Settings, ca.spec.Feed,
		kvfeedCfg.InitialHighWater, sf, ca.encoder, ca.sink, rowsFn, knobs, metrics)
	ca.startKVFeed(ctx, kvfeedCfg)

	return ctx
}

func (ca *changeAggregator) startKVFeed(ctx context.Context, kvfeedCfg kvfeed.Config) {
	// Give errCh enough buffer for both possible errors from supporting
	// goroutines, but only the first one is ever used.
	ca.errCh = make(chan error, 2)
	ca.kvFeedDoneCh = make(chan struct{})
	if err := ca.flowCtx.Stopper().RunAsyncTask(ctx, "changefeed-poller", func(ctx context.Context) {
		defer close(ca.kvFeedDoneCh)
		// Trying to call MoveToDraining here is racy (`MoveToDraining called in
		// state stateTrailingMeta`), so return the error via a channel.
		ca.errCh <- kvfeed.Run(ctx, kvfeedCfg)
		ca.cancel()
	}); err != nil {
		// If err != nil then the RunAsyncTask closure never ran, which means we
		// need to manually close ca.kvFeedDoneCh so `(*changeAggregator).close`
		// doesn't hang.
		close(ca.kvFeedDoneCh)
		ca.errCh <- err
		ca.cancel()
	}
}

func makeKVFeedCfg(
	cfg *execinfra.ServerConfig,
	leaseMgr *lease.Manager,
	mm *mon.BytesMonitor,
	spec execinfrapb.ChangeAggregatorSpec,
	spans []roachpb.Span,
	withDiff bool,
	buf kvfeed.EventBuffer,
	metrics *Metrics,
) kvfeed.Config {
	schemaChangeEvents := changefeedbase.SchemaChangeEventClass(
		spec.Feed.Opts[changefeedbase.OptSchemaChangeEvents])
	schemaChangePolicy := changefeedbase.SchemaChangePolicy(
		spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
	initialHighWater, needsInitialScan := getKVFeedInitialParameters(spec)
	kvfeedCfg := kvfeed.Config{
		Sink:               buf,
		Settings:           cfg.Settings,
		DB:                 cfg.DB,
		Clock:              cfg.DB.Clock(),
		Gossip:             cfg.Gossip,
		Spans:              spans,
		Targets:            spec.Feed.Targets,
		LeaseMgr:           leaseMgr,
		Metrics:            &metrics.KVFeedMetrics,
		MM:                 mm,
		InitialHighWater:   initialHighWater,
		WithDiff:           withDiff,
		NeedsInitialScan:   needsInitialScan,
		SchemaChangeEvents: schemaChangeEvents,
		SchemaChangePolicy: schemaChangePolicy,
	}
	return kvfeedCfg
}

// getKVFeedInitialParameters determines the starting timestamp for the kv feed
// and whether or not an initial scan is needed. The need for an initial scan is
// determined by whether the watches in the spec have a resolved timestamp. The
// higher layers mark each watch with the checkpointed resolved timestamp if no
// initial scan is needed.
//
// TODO(ajwerner): Utilize this partial checkpointing, especially in the face of
// logical backfills of a single table while progress is made on others, or
// get rid of it. See https://github.com/cockroachdb/cockroach/issues/43896.
func getKVFeedInitialParameters(
	spec execinfrapb.ChangeAggregatorSpec,
) (initialHighWater hlc.Timestamp, needsInitialScan bool) {
	for _, watch := range spec.Watches {
		if initialHighWater.IsEmpty() || watch.InitialResolved.Less(initialHighWater) {
			initialHighWater = watch.InitialResolved
		}
	}
	// This will be true in the case where we have no cursor and we've never
	// checkpointed a resolved timestamp or we have a cursor but we want an
	// initial scan. The higher levels will coordinate that we only have empty
	// watches when we need an initial scan.
	if needsInitialScan = initialHighWater.IsEmpty(); needsInitialScan {
		initialHighWater = spec.Feed.StatementTime
	}
	return initialHighWater, needsInitialScan
}

// setupSpans is called on start to extract the spans for this changefeed as a
// slice and creates a span frontier with the initial resolved timestamps. This
// SpanFrontier only tracks the spans being watched on this node. There is a
// different SpanFrontier elsewhere for the entire changefeed. This object is
// used to filter out some previously emitted rows, and by the cloudStorageSink
// to name its output files in lexicographically monotonic fashion.
func (ca *changeAggregator) setupSpans() ([]roachpb.Span, *span.Frontier) {
	spans := make([]roachpb.Span, 0, len(ca.spec.Watches))
	for _, watch := range ca.spec.Watches {
		spans = append(spans, watch.Span)
	}
	sf := span.MakeFrontier(spans...)
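	// Seed the frontier with each watch's checkpointed resolved timestamp so
	// that it starts at the job's prior progress rather than at the zero
	// timestamp.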
	for _, watch := range ca.spec.Watches {
		sf.Forward(watch.Span, watch.InitialResolved)
	}
	return spans, sf
}

// close has two purposes: to synchronize on the completion of the helper
// goroutines created by the Start method, and to clean up any resources used by
// the processor. Because this method may be called even if the processor did
// not finish starting up, there is an excessive amount of nil checking.
func (ca *changeAggregator) close() {
	if ca.InternalClose() {
		// Shut down the poller if it wasn't already.
		if ca.cancel != nil {
			ca.cancel()
		}
		// Wait for the poller to finish shutting down.
		if ca.kvFeedDoneCh != nil {
			<-ca.kvFeedDoneCh
		}
		if ca.sink != nil {
			if err := ca.sink.Close(); err != nil {
				log.Warningf(ca.Ctx, `error closing sink. goroutines may have leaked: %v`, err)
			}
		}
		ca.memAcc.Close(ca.Ctx)
		if ca.kvFeedMemMon != nil {
			ca.kvFeedMemMon.Stop(ca.Ctx)
		}
		ca.MemMonitor.Stop(ca.Ctx)
	}
}

// Next is part of the RowSource interface.
func (ca *changeAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
	for ca.State == execinfra.StateRunning {
		if !ca.changedRowBuf.IsEmpty() {
			return ca.changedRowBuf.Pop(), nil
		} else if !ca.resolvedSpanBuf.IsEmpty() {
			return ca.resolvedSpanBuf.Pop(), nil
		}
		if err := ca.tick(); err != nil {
			select {
			// If the poller errored first, that's the
			// interesting one, so overwrite `err`.
			case err = <-ca.errCh:
			default:
			}
			// Shut down the poller if it wasn't already.
			ca.cancel()

			ca.MoveToDraining(err)
			break
		}
	}
	return nil, ca.DrainHelper()
}

func (ca *changeAggregator) tick() error {
	resolvedSpans, err := ca.tickFn(ca.Ctx)
	if err != nil {
		return err
	}

	for _, resolvedSpan := range resolvedSpans {
		resolvedBytes, err := protoutil.Marshal(&resolvedSpan)
		if err != nil {
			return err
		}
		// Enqueue a row to be returned that indicates some span-level resolved
		// timestamp has advanced. If any rows were queued in `sink`, they must
		// be emitted first.
		ca.resolvedSpanBuf.Push(sqlbase.EncDatumRow{
			sqlbase.EncDatum{Datum: tree.NewDBytes(tree.DBytes(resolvedBytes))},
			sqlbase.EncDatum{Datum: tree.DNull}, // topic
			sqlbase.EncDatum{Datum: tree.DNull}, // key
			sqlbase.EncDatum{Datum: tree.DNull}, // value
		})
	}
	return nil
}

// ConsumerDone is part of the RowSource interface.
func (ca *changeAggregator) ConsumerDone() {
	ca.MoveToDraining(nil /* err */)
}

// ConsumerClosed is part of the RowSource interface.
func (ca *changeAggregator) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	ca.InternalClose()
}

const (
	emitAllResolved = 0
	emitNoResolved  = -1
)

type changeFrontier struct {
	execinfra.ProcessorBase

	flowCtx *execinfra.FlowCtx
	spec    execinfrapb.ChangeFrontierSpec
	memAcc  mon.BoundAccount
	a       sqlbase.DatumAlloc

	// input returns rows from one or more changeAggregator processors.
	input execinfra.RowSource

	// sf contains the current resolved timestamp high-water for the tracked
	// span set.
	sf *span.Frontier
	// encoder is the Encoder to use for resolved timestamp serialization.
	encoder Encoder
	// sink is the Sink to write resolved timestamps to. Rows are never written
	// by changeFrontier.
	sink Sink
	// freqEmitResolved, if >= 0, is a lower bound on the duration between
	// resolved timestamp emits.
	freqEmitResolved time.Duration
	// lastEmitResolved is the last time a resolved timestamp was emitted.
	lastEmitResolved time.Time
	// lastSlowSpanLog is the last time a slow span from `sf` was logged.
	lastSlowSpanLog time.Time

	// schemaChangeBoundary represents an hlc timestamp at which a schema change
	// event occurred to a target watched by this frontier. If the changefeed is
	// configured to stop on schema change then the changeFrontier will wait for
	// the span frontier to reach the schemaChangeBoundary, will drain, and then
	// will exit. If the changefeed is configured to backfill on schema changes,
	// the changeFrontier will protect the scan timestamp in order to ensure that
	// the scan completes. The protected timestamp will be released when a new scan
	// schemaChangeBoundary is created or the changefeed reaches a timestamp that
	// is near the present.
	//
	// schemaChangeBoundary values are communicated to the changeFrontier via
	// Resolved messages sent from the changeAggregators. The policy regarding
	// which schema change events lead to a schemaChangeBoundary is controlled
	// by the KV feed based on OptSchemaChangeEvents and OptSchemaChangePolicy.
	schemaChangeBoundary hlc.Timestamp

	// jobProgressedFn, if non-nil, is called to checkpoint the changefeed's
	// progress in the corresponding system job entry.
	jobProgressedFn func(context.Context, jobs.HighWaterProgressedFn) error
	// highWaterAtStart is the greater of the job high-water and the timestamp the
	// CHANGEFEED statement was run at. It's used in an assertion that we never
	// regress the job high-water.
	highWaterAtStart hlc.Timestamp
	// passthroughBuf, in some but not all flows, contains changed row data to
	// pass through unchanged to the gateway node.
	passthroughBuf encDatumRowBuffer
	// resolvedBuf, if non-nil, contains rows indicating a changefeed-level
	// resolved timestamp to be returned. It depends on everything in
	// `passthroughBuf` being sent, so that one needs to be emptied first.
	resolvedBuf *encDatumRowBuffer
	// metrics are monitoring counters shared between all changefeeds.
	metrics *Metrics
	// metricsID is used as the unique id of this changefeed in the
	// metrics.MaxBehindNanos map.
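	// It is set to -1 by closeMetrics once the feed has been de-registered;
	// handleFrontierChanged checks for that sentinel before writing to the map.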
	metricsID int
}

var _ execinfra.Processor = &changeFrontier{}
var _ execinfra.RowSource = &changeFrontier{}

func newChangeFrontierProcessor(
	flowCtx *execinfra.FlowCtx,
	processorID int32,
	spec execinfrapb.ChangeFrontierSpec,
	input execinfra.RowSource,
	output execinfra.RowReceiver,
) (execinfra.Processor, error) {
	ctx := flowCtx.EvalCtx.Ctx()
	memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "changefntr-mem")
	cf := &changeFrontier{
		flowCtx: flowCtx,
		spec:    spec,
		memAcc:  memMonitor.MakeBoundAccount(),
		input:   input,
		sf:      span.MakeFrontier(spec.TrackedSpans...),
	}
	if err := cf.Init(
		cf, &execinfrapb.PostProcessSpec{},
		input.OutputTypes(),
		flowCtx,
		processorID,
		output,
		memMonitor,
		execinfra.ProcStateOpts{
			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
				cf.close()
				return nil
			},
			InputsToDrain: []execinfra.RowSource{cf.input},
		},
	); err != nil {
		return nil, err
	}

	if r, ok := cf.spec.Feed.Opts[changefeedbase.OptResolvedTimestamps]; ok {
		var err error
		if r == `` {
			// Empty means emit them as often as we have them.
			cf.freqEmitResolved = emitAllResolved
		} else if cf.freqEmitResolved, err = time.ParseDuration(r); err != nil {
			return nil, err
		}
	} else {
		cf.freqEmitResolved = emitNoResolved
	}

	var err error
	if cf.encoder, err = getEncoder(spec.Feed.Opts); err != nil {
		return nil, err
	}

	return cf, nil
}

func (cf *changeFrontier) OutputTypes() []*types.T {
	return changefeedResultTypes
}

// Start is part of the RowSource interface.
func (cf *changeFrontier) Start(ctx context.Context) context.Context {
	cf.input.Start(ctx)

	// StartInternal is called at the beginning of the function because there
	// are early returns if errors are detected.
	ctx = cf.StartInternal(ctx, changeFrontierProcName)

	nodeID, err := cf.flowCtx.EvalCtx.NodeID.OptionalNodeIDErr(48274)
	if err != nil {
		cf.MoveToDraining(err)
		return ctx
	}
	// Pass a nil oracle because this sink is only used to emit resolved
	// timestamps, while the oracle is only used when emitting row updates.
	var nilOracle timestampLowerBoundOracle
	if cf.sink, err = getSink(
		ctx, cf.spec.Feed.SinkURI, nodeID, cf.spec.Feed.Opts, cf.spec.Feed.Targets,
		cf.flowCtx.Cfg.Settings, nilOracle, cf.flowCtx.Cfg.ExternalStorageFromURI,
	); err != nil {
		err = MarkRetryableError(err)
		cf.MoveToDraining(err)
		return ctx
	}

	if b, ok := cf.sink.(*bufferSink); ok {
		cf.resolvedBuf = &b.buf
	}

	// The job registry has a set of metrics used to monitor the various jobs it
	// runs. They're all stored as the `metric.Struct` interface because of
	// dependency cycles.
	cf.metrics = cf.flowCtx.Cfg.JobRegistry.MetricsStruct().Changefeed.(*Metrics)
	cf.sink = makeMetricsSink(cf.metrics, cf.sink)
	cf.sink = &errorWrapperSink{wrapped: cf.sink}

	cf.highWaterAtStart = cf.spec.Feed.StatementTime
	if cf.spec.JobID != 0 {
		job, err := cf.flowCtx.Cfg.JobRegistry.LoadJob(ctx, cf.spec.JobID)
		if err != nil {
			cf.MoveToDraining(err)
			return ctx
		}
		cf.jobProgressedFn = job.HighWaterProgressed

		p := job.Progress()
		if ts := p.GetHighWater(); ts != nil {
			cf.highWaterAtStart.Forward(*ts)
		}
	}

	cf.metrics.mu.Lock()
	cf.metricsID = cf.metrics.mu.id
	cf.metrics.mu.id++
	cf.metrics.mu.Unlock()
	// TODO(dan): It's very important that we de-register from the metric because
	// if we orphan an entry in there, our monitoring will lie (say the changefeed
	// is behind when it may not be). We call this in `close` but that doesn't
	// always get called when the processor is shut down (especially during crdb
	// chaos), so here's something that maybe will work some of the times that
	// close doesn't. This is all very hacky. The real answer is to fix whatever
	// bugs currently exist in processor shutdown.
	go func() {
		<-ctx.Done()
		cf.closeMetrics()
	}()

	return ctx
}

func (cf *changeFrontier) close() {
	if cf.InternalClose() {
		if cf.metrics != nil {
			cf.closeMetrics()
		}
		if cf.sink != nil {
			if err := cf.sink.Close(); err != nil {
				log.Warningf(cf.Ctx, `error closing sink. goroutines may have leaked: %v`, err)
			}
		}
		cf.memAcc.Close(cf.Ctx)
		cf.MemMonitor.Stop(cf.Ctx)
	}
}

// closeMetrics de-registers from the progress registry that powers
// `changefeed.max_behind_nanos`. This method is idempotent.
func (cf *changeFrontier) closeMetrics() {
	// Delete this feed from the MaxBehindNanos metric so it's no longer
	// considered by the gauge.
	cf.metrics.mu.Lock()
	delete(cf.metrics.mu.resolved, cf.metricsID)
	cf.metricsID = -1
	cf.metrics.mu.Unlock()
}

// schemaChangeBoundaryReached returns true if the spanFrontier is at the
// current schemaChangeBoundary.
func (cf *changeFrontier) schemaChangeBoundaryReached() (r bool) {
	return !cf.schemaChangeBoundary.IsEmpty() && cf.schemaChangeBoundary.Equal(cf.sf.Frontier())
}

// shouldFailOnSchemaChange checks the job's spec to determine whether it should
// fail on schema change events after all spans have been resolved.
func (cf *changeFrontier) shouldFailOnSchemaChange() bool {
	policy := changefeedbase.SchemaChangePolicy(cf.spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
	return policy == changefeedbase.OptSchemaChangePolicyStop
}

// shouldProtectBoundaries checks the job's spec to determine whether it should
// install protected timestamps when encountering scan boundaries.
func (cf *changeFrontier) shouldProtectBoundaries() bool {
	policy := changefeedbase.SchemaChangePolicy(cf.spec.Feed.Opts[changefeedbase.OptSchemaChangePolicy])
	return policy == changefeedbase.OptSchemaChangePolicyBackfill
}

// Next is part of the RowSource interface.
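// It emits any buffered passthrough rows first, then any buffered resolved
// rows, and otherwise consumes rows from the upstream changeAggregators,
// either passing changed rows through (for core changefeeds) or folding
// resolved spans into the local frontier.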
func (cf *changeFrontier) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
	for cf.State == execinfra.StateRunning {
		if !cf.passthroughBuf.IsEmpty() {
			return cf.passthroughBuf.Pop(), nil
		} else if !cf.resolvedBuf.IsEmpty() {
			return cf.resolvedBuf.Pop(), nil
		}

		if cf.schemaChangeBoundaryReached() && cf.shouldFailOnSchemaChange() {
			// TODO(ajwerner): make this more useful by at least informing the client
			// of which tables changed.
			cf.MoveToDraining(pgerror.Newf(pgcode.SchemaChangeOccurred,
				"schema change occurred at %v", cf.schemaChangeBoundary.Next().AsOfSystemTime()))
			break
		}

		row, meta := cf.input.Next()
		if meta != nil {
			if meta.Err != nil {
				cf.MoveToDraining(nil /* err */)
			}
			return nil, meta
		}
		if row == nil {
			cf.MoveToDraining(nil /* err */)
			break
		}

		if row[0].IsNull() {
			// In changefeeds with a sink, this will never happen. But in the
			// core changefeed, which returns changed rows directly via pgwire,
			// a row with a null resolved_span field is a changed row that needs
			// to be forwarded to the gateway.
			cf.passthroughBuf.Push(row)
			continue
		}

		if err := cf.noteResolvedSpan(row[0]); err != nil {
			cf.MoveToDraining(err)
			break
		}
	}
	return nil, cf.DrainHelper()
}

func (cf *changeFrontier) noteResolvedSpan(d sqlbase.EncDatum) error {
	if err := d.EnsureDecoded(changefeedResultTypes[0], &cf.a); err != nil {
		return err
	}
	raw, ok := d.Datum.(*tree.DBytes)
	if !ok {
		return errors.AssertionFailedf(`unexpected datum type %T: %s`, d.Datum, d.Datum)
	}
	var resolved jobspb.ResolvedSpan
	if err := protoutil.Unmarshal([]byte(*raw), &resolved); err != nil {
		return errors.NewAssertionErrorWithWrappedErrf(err,
			`unmarshalling resolved span: %x`, raw)
	}

	// Inserting a timestamp less than the one the changefeed flow started at
	// could potentially regress the job progress. This is not expected, but it
	// was a bug at one point, so assert to prevent regressions.
	//
	// TODO(dan): This is much more naturally expressed as an assert inside the
	// job progress update closure, but it currently doesn't pass along the info
	// we'd need to do it that way.
	if !resolved.Timestamp.IsEmpty() && resolved.Timestamp.Less(cf.highWaterAtStart) {
		log.ReportOrPanic(cf.Ctx, &cf.flowCtx.Cfg.Settings.SV,
			`got a span level timestamp %s for %s that is less than the initial high-water %s`,
			log.Safe(resolved.Timestamp), resolved.Span, log.Safe(cf.highWaterAtStart))
		return nil
	}

	// We want to mark the schemaChangeBoundary and then detect when the
	// frontier reaches or passes it.
	if resolved.BoundaryReached && (cf.schemaChangeBoundary.IsEmpty() || resolved.Timestamp.Less(cf.schemaChangeBoundary)) {
		cf.schemaChangeBoundary = resolved.Timestamp
	}
	// If we've moved past a schemaChangeBoundary, make sure to clear it.
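	// That is the case when a non-boundary resolved timestamp arrives that is
	// strictly greater than the previously recorded boundary.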
	if !resolved.BoundaryReached && !cf.schemaChangeBoundary.IsEmpty() && cf.schemaChangeBoundary.Less(resolved.Timestamp) {
		cf.schemaChangeBoundary = hlc.Timestamp{}
	}

	frontierChanged := cf.sf.Forward(resolved.Span, resolved.Timestamp)
	isBehind := cf.maybeLogBehindSpan(frontierChanged)
	if frontierChanged {
		if err := cf.handleFrontierChanged(isBehind); err != nil {
			return err
		}
	}
	return nil
}

func (cf *changeFrontier) handleFrontierChanged(isBehind bool) error {
	newResolved := cf.sf.Frontier()
	cf.metrics.mu.Lock()
	if cf.metricsID != -1 {
		cf.metrics.mu.resolved[cf.metricsID] = newResolved
	}
	cf.metrics.mu.Unlock()
	if err := cf.checkpointResolvedTimestamp(newResolved, isBehind); err != nil {
		return err
	}
	if err := cf.maybeEmitResolved(newResolved); err != nil {
		return err
	}
	return nil
}

// checkpointResolvedTimestamp checkpoints a changefeed-level resolved timestamp
// to the jobs record. It additionally manages the protected timestamp state
// which is stored in the job progress details. It is only called if the new
// resolved timestamp is later than the current one. The isBehind argument is
// used to determine whether an existing protected timestamp should be released.
func (cf *changeFrontier) checkpointResolvedTimestamp(
	resolved hlc.Timestamp, isBehind bool,
) (err error) {
	// NB: Sinkless changefeeds will not have a jobProgressedFn. In fact, they
	// have no distributed state whatsoever. Because of this they also do not
	// use protected timestamps.
	if cf.jobProgressedFn == nil {
		return nil
	}
	return cf.jobProgressedFn(cf.Ctx, func(
		ctx context.Context, txn *kv.Txn, details jobspb.ProgressDetails,
	) (hlc.Timestamp, error) {
		progress := details.(*jobspb.Progress_Changefeed).Changefeed
		if err := cf.manageProtectedTimestamps(ctx, progress, txn, resolved, isBehind); err != nil {
			return hlc.Timestamp{}, err
		}
		return resolved, nil
	})
}

// manageProtectedTimestamps is called when the resolved timestamp is being
// checkpointed. The changeFrontier always checkpoints resolved timestamps
// which occur at scan boundaries. It releases previously protected timestamps
// if the changefeed is not behind. See maybeLogBehindSpan for details on the
// behind calculation.
//
// Note that this function is never called for sinkless changefeeds as they have
// no corresponding job and thus no corresponding distributed state on which to
// attach protected timestamp information.
//
// TODO(ajwerner): Adopt protected timestamps for sinkless changefeeds,
// perhaps by using whatever mechanism is eventually built to protect
// data for long-running SQL transactions. There's some discussion of this
// use case in the protected timestamps RFC.
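//
// Concretely, it first releases any existing protected timestamp record when
// appropriate (see maybeReleaseProtectedTimestamp) and then installs a new one
// if a schema-change boundary requiring a backfill has been reached (see
// maybeProtectTimestamp).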
func (cf *changeFrontier) manageProtectedTimestamps(
	ctx context.Context,
	progress *jobspb.ChangefeedProgress,
	txn *kv.Txn,
	resolved hlc.Timestamp,
	isBehind bool,
) error {
	pts := cf.flowCtx.Cfg.ProtectedTimestampProvider
	if err := cf.maybeReleaseProtectedTimestamp(ctx, progress, pts, txn, isBehind); err != nil {
		return err
	}
	return cf.maybeProtectTimestamp(ctx, progress, pts, txn, resolved)
}

// maybeReleaseProtectedTimestamp will release the current protected timestamp
// if either the resolved timestamp is close to the present or we've reached
// a new schemaChangeBoundary which will be protected.
func (cf *changeFrontier) maybeReleaseProtectedTimestamp(
	ctx context.Context,
	progress *jobspb.ChangefeedProgress,
	pts protectedts.Storage,
	txn *kv.Txn,
	isBehind bool,
) error {
	if progress.ProtectedTimestampRecord == uuid.Nil {
		return nil
	}
	if !cf.schemaChangeBoundaryReached() && isBehind {
		log.VEventf(ctx, 2, "not releasing protected timestamp because changefeed is behind")
		return nil
	}
	log.VEventf(ctx, 2, "releasing protected timestamp %v",
		progress.ProtectedTimestampRecord)
	if err := pts.Release(ctx, txn, progress.ProtectedTimestampRecord); err != nil {
		return err
	}
	progress.ProtectedTimestampRecord = uuid.Nil
	return nil
}

// maybeProtectTimestamp creates a new protected timestamp when the
// changeFrontier reaches a scanBoundary and the schemaChangePolicy indicates
// that we should perform a backfill (see cf.shouldProtectBoundaries()).
func (cf *changeFrontier) maybeProtectTimestamp(
	ctx context.Context,
	progress *jobspb.ChangefeedProgress,
	pts protectedts.Storage,
	txn *kv.Txn,
	resolved hlc.Timestamp,
) error {
	if cf.isSinkless() || !cf.schemaChangeBoundaryReached() || !cf.shouldProtectBoundaries() {
		return nil
	}

	jobID := cf.spec.JobID
	targets := cf.spec.Feed.Targets
	return createProtectedTimestampRecord(ctx, pts, txn, jobID, targets, resolved, progress)
}

func (cf *changeFrontier) maybeEmitResolved(newResolved hlc.Timestamp) error {
	if cf.freqEmitResolved == emitNoResolved {
		return nil
	}
	sinceEmitted := newResolved.GoTime().Sub(cf.lastEmitResolved)
	shouldEmit := sinceEmitted >= cf.freqEmitResolved || cf.schemaChangeBoundaryReached()
	if !shouldEmit {
		return nil
	}
	// Keeping this after the checkpointResolvedTimestamp call will avoid
	// some duplicates if a restart happens.
	if err := emitResolvedTimestamp(cf.Ctx, cf.encoder, cf.sink, newResolved); err != nil {
		return err
	}
	cf.lastEmitResolved = newResolved.GoTime()
	return nil
}

// Potentially log the most behind span in the frontier for debugging. The
// returned boolean will be true if the resolved timestamp lags far behind the
// present as defined by the current configuration.
func (cf *changeFrontier) maybeLogBehindSpan(frontierChanged bool) (isBehind bool) {
	// These two cluster setting values represent the target responsiveness of
	// poller and range feed. The cluster setting for switching between poller and
	// rangefeed is only checked at changefeed start/resume, so instead of
	// switching on it here, just add them. Also add 1 second in case both these
	// settings are set really low (as they are in unit tests).
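	// For example, with hypothetical values of a 1s poll interval and a 3s
	// closed timestamp target duration, the slowness threshold below works out
	// to 1s + 10*(1s+3s) = 41s.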
	pollInterval := changefeedbase.TableDescriptorPollInterval.Get(&cf.flowCtx.Cfg.Settings.SV)
	closedtsInterval := closedts.TargetDuration.Get(&cf.flowCtx.Cfg.Settings.SV)
	slownessThreshold := time.Second + 10*(pollInterval+closedtsInterval)
	frontier := cf.sf.Frontier()
	now := timeutil.Now()
	resolvedBehind := now.Sub(frontier.GoTime())
	if resolvedBehind <= slownessThreshold {
		return false
	}

	description := `sinkless feed`
	if !cf.isSinkless() {
		description = fmt.Sprintf("job %d", cf.spec.JobID)
	}
	if frontierChanged {
		log.Infof(cf.Ctx, "%s new resolved timestamp %s is behind by %s",
			description, frontier, resolvedBehind)
	}
	const slowSpanMaxFrequency = 10 * time.Second
	if now.Sub(cf.lastSlowSpanLog) > slowSpanMaxFrequency {
		cf.lastSlowSpanLog = now
		s := cf.sf.PeekFrontierSpan()
		log.Infof(cf.Ctx, "%s span %s is behind by %s", description, s, resolvedBehind)
	}
	return true
}

// ConsumerDone is part of the RowSource interface.
func (cf *changeFrontier) ConsumerDone() {
	cf.MoveToDraining(nil /* err */)
}

// ConsumerClosed is part of the RowSource interface.
func (cf *changeFrontier) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	cf.InternalClose()
}

// isSinkless returns true if this changeFrontier is sinkless and thus does not
// have a job.
func (cf *changeFrontier) isSinkless() bool {
	return cf.spec.JobID == 0
}