github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/changefeed.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Licensed as a CockroachDB Enterprise file under the Cockroach Community 4 // License (the "License"); you may not use this file except in compliance with 5 // the License. You may obtain a copy of the License at 6 // 7 // https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt 8 9 package changefeedccl 10 11 import ( 12 "context" 13 "time" 14 15 "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase" 16 "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/kvfeed" 17 "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" 18 "github.com/cockroachdb/cockroach/pkg/jobs/jobsprotectedts" 19 "github.com/cockroachdb/cockroach/pkg/keys" 20 "github.com/cockroachdb/cockroach/pkg/kv" 21 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts" 22 "github.com/cockroachdb/cockroach/pkg/roachpb" 23 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 24 "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" 25 "github.com/cockroachdb/cockroach/pkg/sql/row" 26 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 27 "github.com/cockroachdb/cockroach/pkg/util/bufalloc" 28 "github.com/cockroachdb/cockroach/pkg/util/hlc" 29 "github.com/cockroachdb/cockroach/pkg/util/log" 30 "github.com/cockroachdb/cockroach/pkg/util/span" 31 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 32 "github.com/cockroachdb/cockroach/pkg/util/uuid" 33 "github.com/cockroachdb/errors" 34 ) 35 36 const ( 37 jsonMetaSentinel = `__crdb__` 38 ) 39 40 type emitEntry struct { 41 // row, if not the zero value, represents a changed row to be emitted. 42 row encodeRow 43 44 // resolved, if non-nil, is a guarantee for the associated 45 // span that no previously unseen entries with a lower or equal updated 46 // timestamp will be emitted. 47 resolved *jobspb.ResolvedSpan 48 49 // bufferGetTimestamp is the time this entry came out of the buffer. 50 bufferGetTimestamp time.Time 51 } 52 53 // kvsToRows gets changed kvs from a closure and converts them into sql rows. It 54 // returns a closure that may be repeatedly called to advance the changefeed. 55 // The returned closure is not threadsafe. 56 func kvsToRows( 57 codec keys.SQLCodec, 58 leaseMgr *lease.Manager, 59 details jobspb.ChangefeedDetails, 60 inputFn func(context.Context) (kvfeed.Event, error), 61 ) func(context.Context) ([]emitEntry, error) { 62 _, withDiff := details.Opts[changefeedbase.OptDiff] 63 rfCache := newRowFetcherCache(codec, leaseMgr) 64 65 var kvs row.SpanKVFetcher 66 appendEmitEntryForKV := func( 67 ctx context.Context, 68 output []emitEntry, 69 kv roachpb.KeyValue, 70 prevVal roachpb.Value, 71 schemaTimestamp hlc.Timestamp, 72 prevSchemaTimestamp hlc.Timestamp, 73 bufferGetTimestamp time.Time, 74 ) ([]emitEntry, error) { 75 76 desc, err := rfCache.TableDescForKey(ctx, kv.Key, schemaTimestamp) 77 if err != nil { 78 return nil, err 79 } 80 if _, ok := details.Targets[desc.ID]; !ok { 81 // This kv is for an interleaved table that we're not watching. 82 if log.V(3) { 83 log.Infof(ctx, `skipping key from unwatched table %s: %s`, desc.Name, kv.Key) 84 } 85 return nil, nil 86 } 87 88 rf, err := rfCache.RowFetcherForTableDesc(desc) 89 if err != nil { 90 return nil, err 91 } 92 93 // Get new value. 94 var r emitEntry 95 r.bufferGetTimestamp = bufferGetTimestamp 96 { 97 // TODO(dan): Handle tables with multiple column families. 98 // Reuse kvs to save allocations. 99 kvs.KVs = kvs.KVs[:0] 100 kvs.KVs = append(kvs.KVs, kv) 101 if err := rf.StartScanFrom(ctx, &kvs); err != nil { 102 return nil, err 103 } 104 105 r.row.datums, r.row.tableDesc, _, err = rf.NextRow(ctx) 106 if err != nil { 107 return nil, err 108 } 109 if r.row.datums == nil { 110 return nil, errors.AssertionFailedf("unexpected empty datums") 111 } 112 r.row.datums = append(sqlbase.EncDatumRow(nil), r.row.datums...) 113 r.row.deleted = rf.RowIsDeleted() 114 r.row.updated = schemaTimestamp 115 116 // Assert that we don't get a second row from the row.Fetcher. We 117 // fed it a single KV, so that would be surprising. 118 var nextRow emitEntry 119 nextRow.row.datums, nextRow.row.tableDesc, _, err = rf.NextRow(ctx) 120 if err != nil { 121 return nil, err 122 } 123 if nextRow.row.datums != nil { 124 return nil, errors.AssertionFailedf("unexpected non-empty datums") 125 } 126 } 127 128 // Get prev value, if necessary. 129 if withDiff { 130 prevRF := rf 131 if prevSchemaTimestamp != schemaTimestamp { 132 // If the previous value is being interpreted under a different 133 // version of the schema, fetch the correct table descriptor and 134 // create a new row.Fetcher with it. 135 prevDesc, err := rfCache.TableDescForKey(ctx, kv.Key, prevSchemaTimestamp) 136 if err != nil { 137 return nil, err 138 } 139 140 prevRF, err = rfCache.RowFetcherForTableDesc(prevDesc) 141 if err != nil { 142 return nil, err 143 } 144 } 145 146 prevKV := roachpb.KeyValue{Key: kv.Key, Value: prevVal} 147 // TODO(dan): Handle tables with multiple column families. 148 // Reuse kvs to save allocations. 149 kvs.KVs = kvs.KVs[:0] 150 kvs.KVs = append(kvs.KVs, prevKV) 151 if err := prevRF.StartScanFrom(ctx, &kvs); err != nil { 152 return nil, err 153 } 154 r.row.prevDatums, r.row.prevTableDesc, _, err = prevRF.NextRow(ctx) 155 if err != nil { 156 return nil, err 157 } 158 if r.row.prevDatums == nil { 159 return nil, errors.AssertionFailedf("unexpected empty datums") 160 } 161 r.row.prevDatums = append(sqlbase.EncDatumRow(nil), r.row.prevDatums...) 162 r.row.prevDeleted = prevRF.RowIsDeleted() 163 164 // Assert that we don't get a second row from the row.Fetcher. We 165 // fed it a single KV, so that would be surprising. 166 var nextRow emitEntry 167 nextRow.row.prevDatums, nextRow.row.prevTableDesc, _, err = prevRF.NextRow(ctx) 168 if err != nil { 169 return nil, err 170 } 171 if nextRow.row.prevDatums != nil { 172 return nil, errors.AssertionFailedf("unexpected non-empty datums") 173 } 174 } 175 176 output = append(output, r) 177 return output, nil 178 } 179 180 var output []emitEntry 181 return func(ctx context.Context) ([]emitEntry, error) { 182 // Reuse output to save allocations. 183 output = output[:0] 184 for { 185 input, err := inputFn(ctx) 186 if err != nil { 187 return nil, err 188 } 189 switch input.Type() { 190 case kvfeed.KVEvent: 191 kv := input.KV() 192 if log.V(3) { 193 log.Infof(ctx, "changed key %s %s", kv.Key, kv.Value.Timestamp) 194 } 195 schemaTimestamp := kv.Value.Timestamp 196 prevSchemaTimestamp := schemaTimestamp 197 if backfillTs := input.BackfillTimestamp(); backfillTs != (hlc.Timestamp{}) { 198 schemaTimestamp = backfillTs 199 prevSchemaTimestamp = schemaTimestamp.Prev() 200 } 201 output, err = appendEmitEntryForKV( 202 ctx, output, kv, input.PrevValue(), 203 schemaTimestamp, prevSchemaTimestamp, 204 input.BufferGetTimestamp()) 205 if err != nil { 206 return nil, err 207 } 208 case kvfeed.ResolvedEvent: 209 output = append(output, emitEntry{ 210 resolved: input.Resolved(), 211 bufferGetTimestamp: input.BufferGetTimestamp(), 212 }) 213 } 214 if output != nil { 215 return output, nil 216 } 217 } 218 } 219 } 220 221 // emitEntries connects to a sink, receives rows from a closure, and repeatedly 222 // emits them to the sink. It returns a closure that may be repeatedly called to 223 // advance the changefeed and which returns span-level resolved timestamp 224 // updates. The returned closure is not threadsafe. Note that rows read from 225 // `inputFn` which precede or equal the Frontier of `sf` will not be emitted 226 // because they're provably duplicates. 227 func emitEntries( 228 settings *cluster.Settings, 229 details jobspb.ChangefeedDetails, 230 cursor hlc.Timestamp, 231 sf *span.Frontier, 232 encoder Encoder, 233 sink Sink, 234 inputFn func(context.Context) ([]emitEntry, error), 235 knobs TestingKnobs, 236 metrics *Metrics, 237 ) func(context.Context) ([]jobspb.ResolvedSpan, error) { 238 var scratch bufalloc.ByteAllocator 239 emitRowFn := func(ctx context.Context, row encodeRow) error { 240 // Ensure that row updates are strictly newer than the least resolved timestamp 241 // being tracked by the local span frontier. The poller should not be forwarding 242 // row updates that have timestamps less than or equal to any resolved timestamp 243 // it's forwarded before. 244 // TODO(dan): This should be an assertion once we're confident this can never 245 // happen under any circumstance. 246 if row.updated.LessEq(sf.Frontier()) && !row.updated.Equal(cursor) { 247 log.Errorf(ctx, "cdc ux violation: detected timestamp %s that is less than "+ 248 "or equal to the local frontier %s.", cloudStorageFormatTime(row.updated), 249 cloudStorageFormatTime(sf.Frontier())) 250 return nil 251 } 252 var keyCopy, valueCopy []byte 253 encodedKey, err := encoder.EncodeKey(ctx, row) 254 if err != nil { 255 return err 256 } 257 scratch, keyCopy = scratch.Copy(encodedKey, 0 /* extraCap */) 258 encodedValue, err := encoder.EncodeValue(ctx, row) 259 if err != nil { 260 return err 261 } 262 scratch, valueCopy = scratch.Copy(encodedValue, 0 /* extraCap */) 263 264 if knobs.BeforeEmitRow != nil { 265 if err := knobs.BeforeEmitRow(ctx); err != nil { 266 return err 267 } 268 } 269 if err := sink.EmitRow( 270 ctx, row.tableDesc, keyCopy, valueCopy, row.updated, 271 ); err != nil { 272 return err 273 } 274 if log.V(3) { 275 log.Infof(ctx, `row %s: %s -> %s`, row.tableDesc.Name, keyCopy, valueCopy) 276 } 277 return nil 278 } 279 280 var lastFlush time.Time 281 // TODO(dan): We could keep these in `sf` to eliminate dups. 282 var resolvedSpans []jobspb.ResolvedSpan 283 284 return func(ctx context.Context) ([]jobspb.ResolvedSpan, error) { 285 inputs, err := inputFn(ctx) 286 if err != nil { 287 return nil, err 288 } 289 var boundaryReached bool 290 for _, input := range inputs { 291 if input.bufferGetTimestamp == (time.Time{}) { 292 // We could gracefully handle this instead of panic'ing, but 293 // we'd really like to be able to reason about this data, so 294 // instead we're defensive. If this is ever seen in prod without 295 // breaking a unit test, then we have a pretty severe test 296 // coverage issue. 297 panic(`unreachable: bufferGetTimestamp is set by all codepaths`) 298 } 299 processingNanos := timeutil.Since(input.bufferGetTimestamp).Nanoseconds() 300 metrics.ProcessingNanos.Inc(processingNanos) 301 302 if input.row.datums != nil { 303 if err := emitRowFn(ctx, input.row); err != nil { 304 return nil, err 305 } 306 } 307 if input.resolved != nil { 308 boundaryReached = boundaryReached || input.resolved.BoundaryReached 309 _ = sf.Forward(input.resolved.Span, input.resolved.Timestamp) 310 resolvedSpans = append(resolvedSpans, *input.resolved) 311 } 312 } 313 314 // If the resolved timestamp frequency is specified, use it as a rough 315 // approximation of how latency-sensitive the changefeed user is. If 316 // it's not, fall back to the poll interval. 317 // 318 // The current poller implementation means we emit a changefeed-level 319 // resolved timestamps to the user once per changefeedPollInterval. This 320 // buffering adds on average timeBetweenFlushes/2 to that latency. With 321 // timeBetweenFlushes and changefeedPollInterval both set to 1s, TPCC 322 // was seeing about 100x more time spent emitting than flushing. 323 // Dividing by 5 tries to balance these a bit, but ultimately is fairly 324 // unprincipled. 325 // 326 // NB: As long as we periodically get new span-level resolved timestamps 327 // from the poller (which should always happen, even if the watched data 328 // is not changing), then this is sufficient and we don't have to do 329 // anything fancy with timers. 330 var timeBetweenFlushes time.Duration 331 if r, ok := details.Opts[changefeedbase.OptResolvedTimestamps]; ok && r != `` { 332 var err error 333 if timeBetweenFlushes, err = time.ParseDuration(r); err != nil { 334 return nil, err 335 } 336 } else { 337 timeBetweenFlushes = changefeedbase.TableDescriptorPollInterval.Get(&settings.SV) / 5 338 } 339 if len(resolvedSpans) == 0 || 340 (timeutil.Since(lastFlush) < timeBetweenFlushes && !boundaryReached) { 341 return nil, nil 342 } 343 344 // Make sure to flush the sink before forwarding resolved spans, 345 // otherwise, we could lose buffered messages and violate the 346 // at-least-once guarantee. This is also true for checkpointing the 347 // resolved spans in the job progress. 348 if err := sink.Flush(ctx); err != nil { 349 return nil, err 350 } 351 lastFlush = timeutil.Now() 352 if knobs.AfterSinkFlush != nil { 353 if err := knobs.AfterSinkFlush(); err != nil { 354 return nil, err 355 } 356 } 357 ret := append([]jobspb.ResolvedSpan(nil), resolvedSpans...) 358 resolvedSpans = resolvedSpans[:0] 359 return ret, nil 360 } 361 } 362 363 // emitResolvedTimestamp emits a changefeed-level resolved timestamp to the 364 // sink. 365 func emitResolvedTimestamp( 366 ctx context.Context, encoder Encoder, sink Sink, resolved hlc.Timestamp, 367 ) error { 368 // TODO(dan): Emit more fine-grained (table level) resolved 369 // timestamps. 370 if err := sink.EmitResolvedTimestamp(ctx, encoder, resolved); err != nil { 371 return err 372 } 373 if log.V(2) { 374 log.Infof(ctx, `resolved %s`, resolved) 375 } 376 return nil 377 } 378 379 // createProtectedTimestampRecord will create a record to protect the spans for 380 // this changefeed at the resolved timestamp. The progress struct will be 381 // updated to refer to this new protected timestamp record. 382 func createProtectedTimestampRecord( 383 ctx context.Context, 384 pts protectedts.Storage, 385 txn *kv.Txn, 386 jobID int64, 387 targets jobspb.ChangefeedTargets, 388 resolved hlc.Timestamp, 389 progress *jobspb.ChangefeedProgress, 390 ) error { 391 progress.ProtectedTimestampRecord = uuid.MakeV4() 392 log.VEventf(ctx, 2, "creating protected timestamp %v at %v", 393 progress.ProtectedTimestampRecord, resolved) 394 spansToProtect := makeSpansToProtect(targets) 395 rec := jobsprotectedts.MakeRecord( 396 progress.ProtectedTimestampRecord, jobID, resolved, spansToProtect) 397 return pts.Protect(ctx, txn, rec) 398 } 399 400 func makeSpansToProtect(targets jobspb.ChangefeedTargets) []roachpb.Span { 401 // NB: We add 1 because we're also going to protect system.descriptors. 402 // We protect system.descriptors because a changefeed needs all of the history 403 // of table descriptors to version data. 404 spansToProtect := make([]roachpb.Span, 0, len(targets)+1) 405 addTablePrefix := func(id uint32) { 406 tablePrefix := keys.TODOSQLCodec.TablePrefix(id) 407 spansToProtect = append(spansToProtect, roachpb.Span{ 408 Key: tablePrefix, 409 EndKey: tablePrefix.PrefixEnd(), 410 }) 411 } 412 for t := range targets { 413 addTablePrefix(uint32(t)) 414 } 415 addTablePrefix(keys.DescriptorTableID) 416 return spansToProtect 417 } 418 419 // initialScanFromOptions returns whether or not the options indicate the need 420 // for an initial scan on the first run. 421 func initialScanFromOptions(opts map[string]string) bool { 422 _, cursor := opts[changefeedbase.OptCursor] 423 _, initialScan := opts[changefeedbase.OptInitialScan] 424 _, noInitialScan := opts[changefeedbase.OptNoInitialScan] 425 return (cursor && initialScan) || (!cursor && !noInitialScan) 426 }