github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/rangefeed/registry.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rangefeed 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "sync" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/roachpb" 21 "github.com/cockroachdb/cockroach/pkg/storage" 22 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 23 "github.com/cockroachdb/cockroach/pkg/util/bufalloc" 24 "github.com/cockroachdb/cockroach/pkg/util/hlc" 25 "github.com/cockroachdb/cockroach/pkg/util/interval" 26 "github.com/cockroachdb/cockroach/pkg/util/log" 27 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 28 "github.com/cockroachdb/cockroach/pkg/util/retry" 29 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 30 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 31 "github.com/cockroachdb/errors" 32 ) 33 34 // Stream is a object capable of transmitting RangeFeedEvents. 35 type Stream interface { 36 // Context returns the context for this stream. 37 Context() context.Context 38 // Send blocks until it sends m, the stream is done, or the stream breaks. 39 // Send must be safe to call on the same stream in different goroutines. 40 Send(*roachpb.RangeFeedEvent) error 41 } 42 43 // registration is an instance of a rangefeed subscriber who has 44 // registered to receive updates for a specific range of keys. 45 // Updates are delivered to its stream until one of the following 46 // conditions is met: 47 // 1. a Send to the Stream returns an error 48 // 2. the Stream's context is canceled 49 // 3. the registration is manually unregistered 50 // 51 // In all cases, when a registration is unregistered its error 52 // channel is sent an error to inform it that the registration 53 // has finished. 54 type registration struct { 55 // Input. 56 span roachpb.Span 57 catchupTimestamp hlc.Timestamp 58 catchupIter storage.SimpleIterator 59 withDiff bool 60 metrics *Metrics 61 62 // Output. 63 stream Stream 64 errC chan<- *roachpb.Error 65 66 // Internal. 67 id int64 68 keys interval.Range 69 buf chan *roachpb.RangeFeedEvent 70 71 mu struct { 72 sync.Locker 73 // True if this registration buffer has overflowed, dropping a live event. 74 // This will cause the registration to exit with an error once the buffer 75 // has been emptied. 76 overflowed bool 77 // Boolean indicating if all events have been output to stream. Used only 78 // for testing. 79 caughtUp bool 80 // Management of the output loop goroutine, used to ensure proper teardown. 81 outputLoopCancelFn func() 82 disconnected bool 83 } 84 } 85 86 func newRegistration( 87 span roachpb.Span, 88 startTS hlc.Timestamp, 89 catchupIter storage.SimpleIterator, 90 withDiff bool, 91 bufferSz int, 92 metrics *Metrics, 93 stream Stream, 94 errC chan<- *roachpb.Error, 95 ) registration { 96 r := registration{ 97 span: span, 98 catchupTimestamp: startTS, 99 catchupIter: catchupIter, 100 withDiff: withDiff, 101 metrics: metrics, 102 stream: stream, 103 errC: errC, 104 buf: make(chan *roachpb.RangeFeedEvent, bufferSz), 105 } 106 r.mu.Locker = &syncutil.Mutex{} 107 r.mu.caughtUp = true 108 return r 109 } 110 111 // publish attempts to send a single event to the output buffer for this 112 // registration. If the output buffer is full, the overflowed flag is set, 113 // indicating that live events were lost and a catchup scan should be initiated. 114 // If overflowed is already set, events are ignored and not written to the 115 // buffer. 116 func (r *registration) publish(event *roachpb.RangeFeedEvent) { 117 r.validateEvent(event) 118 event = r.maybeStripEvent(event) 119 120 r.mu.Lock() 121 defer r.mu.Unlock() 122 if r.mu.overflowed { 123 return 124 } 125 select { 126 case r.buf <- event: 127 r.mu.caughtUp = false 128 default: 129 // Buffer exceeded and we are dropping this event. Registration will need 130 // a catch-up scan. 131 r.mu.overflowed = true 132 } 133 } 134 135 // validateEvent checks that the event contains enough information for the 136 // registation. 137 func (r *registration) validateEvent(event *roachpb.RangeFeedEvent) { 138 switch t := event.GetValue().(type) { 139 case *roachpb.RangeFeedValue: 140 if t.Key == nil { 141 panic(fmt.Sprintf("unexpected empty RangeFeedValue.Key: %v", t)) 142 } 143 if t.Value.RawBytes == nil { 144 panic(fmt.Sprintf("unexpected empty RangeFeedValue.Value.RawBytes: %v", t)) 145 } 146 if t.Value.Timestamp.IsEmpty() { 147 panic(fmt.Sprintf("unexpected empty RangeFeedValue.Value.Timestamp: %v", t)) 148 } 149 case *roachpb.RangeFeedCheckpoint: 150 if t.Span.Key == nil { 151 panic(fmt.Sprintf("unexpected empty RangeFeedCheckpoint.Span.Key: %v", t)) 152 } 153 default: 154 panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t)) 155 } 156 } 157 158 // maybeStripEvent determines whether the event contains excess information not 159 // applicable to the current registration. If so, it makes a copy of the event 160 // and strips the incompatible information to match only what the registration 161 // requested. 162 func (r *registration) maybeStripEvent(event *roachpb.RangeFeedEvent) *roachpb.RangeFeedEvent { 163 ret := event 164 copyOnWrite := func() interface{} { 165 if ret == event { 166 ret = event.ShallowCopy() 167 } 168 return ret.GetValue() 169 } 170 171 switch t := ret.GetValue().(type) { 172 case *roachpb.RangeFeedValue: 173 if t.PrevValue.IsPresent() && !r.withDiff { 174 // If no registrations for the current Range are requesting previous 175 // values, then we won't even retrieve them on the Raft goroutine. 176 // However, if any are and they overlap with an update then the 177 // previous value on the corresponding events will be populated. 178 // If we're in this case and any other registrations don't want 179 // previous values then we'll need to strip them. 180 t = copyOnWrite().(*roachpb.RangeFeedValue) 181 t.PrevValue = roachpb.Value{} 182 } 183 case *roachpb.RangeFeedCheckpoint: 184 if !t.Span.EqualValue(r.span) { 185 // Checkpoint events are always created spanning the entire Range. 186 // However, a registration might not be listening on updates over 187 // the entire Range. If this is the case then we need to constrain 188 // the checkpoint events published to that registration to just the 189 // span that it's listening on. This is more than just a convenience 190 // to consumers - it would be incorrect to say that a rangefeed has 191 // observed all values up to the checkpoint timestamp over a given 192 // key span if any updates to that span have been filtered out. 193 if !t.Span.Contains(r.span) { 194 panic(fmt.Sprintf("registration span %v larger than checkpoint span %v", r.span, t.Span)) 195 } 196 t = copyOnWrite().(*roachpb.RangeFeedCheckpoint) 197 t.Span = r.span 198 } 199 default: 200 panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t)) 201 } 202 return ret 203 } 204 205 // disconnect cancels the output loop context for the registration and passes an 206 // error to the output error stream for the registration. This also sets the 207 // disconnected flag on the registration, preventing it from being disconnected 208 // again. 209 func (r *registration) disconnect(pErr *roachpb.Error) { 210 r.mu.Lock() 211 defer r.mu.Unlock() 212 if !r.mu.disconnected { 213 if r.mu.outputLoopCancelFn != nil { 214 r.mu.outputLoopCancelFn() 215 } 216 r.mu.disconnected = true 217 r.errC <- pErr 218 } 219 } 220 221 // outputLoop is the operational loop for a single registration. The behavior 222 // is as thus: 223 // 224 // 1. If a catch-up scan is indicated, run one before beginning the proper 225 // output loop. 226 // 2. After catch-up is complete, begin reading from the registration buffer 227 // channel and writing to the output stream until the buffer is empty *and* 228 // the overflow flag has been set. 229 // 230 // The loop exits with any error encountered, if the provided context is 231 // canceled, or when the buffer has overflowed and all pre-overflow entries 232 // have been emitted. 233 func (r *registration) outputLoop(ctx context.Context) error { 234 // If the registration has a catch-up scan, 235 if r.catchupIter != nil { 236 if err := r.runCatchupScan(); err != nil { 237 err = errors.Wrap(err, "catch-up scan failed") 238 log.Errorf(ctx, "%v", err) 239 return err 240 } 241 } 242 243 // Normal buffered output loop. 244 for { 245 overflowed := false 246 r.mu.Lock() 247 if len(r.buf) == 0 { 248 overflowed = r.mu.overflowed 249 r.mu.caughtUp = true 250 } 251 r.mu.Unlock() 252 if overflowed { 253 return newErrBufferCapacityExceeded().GoError() 254 } 255 256 select { 257 case nextEvent := <-r.buf: 258 if err := r.stream.Send(nextEvent); err != nil { 259 return err 260 } 261 case <-ctx.Done(): 262 return ctx.Err() 263 case <-r.stream.Context().Done(): 264 return r.stream.Context().Err() 265 } 266 } 267 } 268 269 func (r *registration) runOutputLoop(ctx context.Context) { 270 r.mu.Lock() 271 ctx, r.mu.outputLoopCancelFn = context.WithCancel(ctx) 272 r.mu.Unlock() 273 err := r.outputLoop(ctx) 274 r.disconnect(roachpb.NewError(err)) 275 } 276 277 // runCatchupScan starts a catchup scan which will output entries for all 278 // recorded changes in the replica that are newer than the catchupTimestamp. 279 // This uses the iterator provided when the registration was originally created; 280 // after the scan completes, the iterator will be closed. 281 func (r *registration) runCatchupScan() error { 282 if r.catchupIter == nil { 283 return nil 284 } 285 start := timeutil.Now() 286 defer func() { 287 r.catchupIter.Close() 288 r.catchupIter = nil 289 r.metrics.RangeFeedCatchupScanNanos.Inc(timeutil.Since(start).Nanoseconds()) 290 }() 291 292 var a bufalloc.ByteAllocator 293 startKey := storage.MakeMVCCMetadataKey(r.span.Key) 294 endKey := storage.MakeMVCCMetadataKey(r.span.EndKey) 295 296 // Iterator will encounter historical values for each key in 297 // reverse-chronological order. To output in chronological order, store 298 // events for the same key until a different key is encountered, then output 299 // the encountered values in reverse. This also allows us to buffer events 300 // as we fill in previous values. 301 var lastKey roachpb.Key 302 reorderBuf := make([]roachpb.RangeFeedEvent, 0, 5) 303 addPrevToLastEvent := func(val []byte) { 304 if l := len(reorderBuf); l > 0 { 305 if reorderBuf[l-1].Val.PrevValue.IsPresent() { 306 panic("RangeFeedValue.PrevVal unexpectedly set") 307 } 308 reorderBuf[l-1].Val.PrevValue.RawBytes = val 309 } 310 } 311 outputEvents := func() error { 312 for i := len(reorderBuf) - 1; i >= 0; i-- { 313 e := reorderBuf[i] 314 if err := r.stream.Send(&e); err != nil { 315 return err 316 } 317 } 318 reorderBuf = reorderBuf[:0] 319 return nil 320 } 321 322 // Iterate though all keys using Next. We want to publish all committed 323 // versions of each key that are after the registration's startTS, so we 324 // can't use NextKey. 325 var meta enginepb.MVCCMetadata 326 r.catchupIter.SeekGE(startKey) 327 for { 328 if ok, err := r.catchupIter.Valid(); err != nil { 329 return err 330 } else if !ok || !r.catchupIter.UnsafeKey().Less(endKey) { 331 break 332 } 333 334 unsafeKey := r.catchupIter.UnsafeKey() 335 unsafeVal := r.catchupIter.UnsafeValue() 336 if !unsafeKey.IsValue() { 337 // Found a metadata key. 338 if err := protoutil.Unmarshal(unsafeVal, &meta); err != nil { 339 return errors.Wrapf(err, "unmarshaling mvcc meta: %v", unsafeKey) 340 } 341 if !meta.IsInline() { 342 // This is an MVCCMetadata key for an intent. The catchup scan 343 // only cares about committed values, so ignore this and skip 344 // past the corresponding provisional key-value. To do this, 345 // scan to the timestamp immediately before (i.e. the key 346 // immediately after) the provisional key. 347 r.catchupIter.SeekGE(storage.MVCCKey{ 348 Key: unsafeKey.Key, 349 Timestamp: hlc.Timestamp(meta.Timestamp).Prev(), 350 }) 351 continue 352 } 353 354 // If write is inline, it doesn't have a timestamp so we don't 355 // filter on the registration's starting timestamp. Instead, we 356 // return all inline writes. 357 unsafeVal = meta.RawBytes 358 } 359 360 // Determine whether the iterator moved to a new key. 361 sameKey := bytes.Equal(unsafeKey.Key, lastKey) 362 if !sameKey { 363 // If so, output events for the last key encountered. 364 if err := outputEvents(); err != nil { 365 return err 366 } 367 a, lastKey = a.Copy(unsafeKey.Key, 0) 368 } 369 key := lastKey 370 ts := unsafeKey.Timestamp 371 372 // Ignore the version if it's not inline and its timestamp is at 373 // or before the registration's (exclusive) starting timestamp. 374 ignore := !(ts.IsEmpty() || r.catchupTimestamp.Less(ts)) 375 if ignore && !r.withDiff { 376 // Skip all the way to the next key. 377 // NB: fast-path to avoid value copy when !r.withDiff. 378 r.catchupIter.NextKey() 379 continue 380 } 381 382 var val []byte 383 a, val = a.Copy(unsafeVal, 0) 384 if r.withDiff { 385 // Update the last version with its previous value (this version). 386 addPrevToLastEvent(val) 387 } 388 389 if ignore { 390 // Skip all the way to the next key. 391 r.catchupIter.NextKey() 392 } else { 393 // Move to the next version of this key. 394 r.catchupIter.Next() 395 396 var event roachpb.RangeFeedEvent 397 event.MustSetValue(&roachpb.RangeFeedValue{ 398 Key: key, 399 Value: roachpb.Value{ 400 RawBytes: val, 401 Timestamp: ts, 402 }, 403 }) 404 reorderBuf = append(reorderBuf, event) 405 } 406 } 407 408 // Output events for the last key encountered. 409 return outputEvents() 410 } 411 412 // ID implements interval.Interface. 413 func (r *registration) ID() uintptr { 414 return uintptr(r.id) 415 } 416 417 // Range implements interval.Interface. 418 func (r *registration) Range() interval.Range { 419 return r.keys 420 } 421 422 func (r registration) String() string { 423 return fmt.Sprintf("[%s @ %s+]", r.span, r.catchupTimestamp) 424 } 425 426 // registry holds a set of registrations and manages their lifecycle. 427 type registry struct { 428 tree interval.Tree // *registration items 429 idAlloc int64 430 } 431 432 func makeRegistry() registry { 433 return registry{ 434 tree: interval.NewTree(interval.ExclusiveOverlapper), 435 } 436 } 437 438 // Len returns the number of registrations in the registry. 439 func (reg *registry) Len() int { 440 return reg.tree.Len() 441 } 442 443 // NewFilter returns a operation filter reflecting the registrations 444 // in the registry. 445 func (reg *registry) NewFilter() *Filter { 446 return newFilterFromRegistry(reg) 447 } 448 449 // Register adds the provided registration to the registry. 450 func (reg *registry) Register(r *registration) { 451 r.id = reg.nextID() 452 r.keys = r.span.AsRange() 453 if err := reg.tree.Insert(r, false /* fast */); err != nil { 454 panic(err) 455 } 456 } 457 458 func (reg *registry) nextID() int64 { 459 reg.idAlloc++ 460 return reg.idAlloc 461 } 462 463 // PublishToOverlapping publishes the provided event to all registrations whose 464 // range overlaps the specified span. 465 func (reg *registry) PublishToOverlapping(span roachpb.Span, event *roachpb.RangeFeedEvent) { 466 // Determine the earliest starting timestamp that a registration 467 // can have while still needing to hear about this event. 468 var minTS hlc.Timestamp 469 switch t := event.GetValue().(type) { 470 case *roachpb.RangeFeedValue: 471 // Only publish values to registrations with starting 472 // timestamps equal to or greater than the value's timestamp. 473 minTS = t.Value.Timestamp 474 case *roachpb.RangeFeedCheckpoint: 475 // Always publish checkpoint notifications, regardless of a registration's 476 // starting timestamp. 477 // 478 // TODO(dan): It's unclear if this is the right contract, it's certainly 479 // surprising. Revisit this once RangeFeed has more users. 480 minTS = hlc.MaxTimestamp 481 default: 482 panic(fmt.Sprintf("unexpected RangeFeedEvent variant: %v", t)) 483 } 484 485 reg.forOverlappingRegs(span, func(r *registration) (bool, *roachpb.Error) { 486 // Don't publish events if they are equal to or less 487 // than the registration's starting timestamp. 488 if r.catchupTimestamp.Less(minTS) { 489 r.publish(event) 490 } 491 return false, nil 492 }) 493 } 494 495 // Unregister removes a registration from the registry. It is assumed that the 496 // registration has already been disconnected, this is intended only to clean 497 // up the registry. 498 func (reg *registry) Unregister(r *registration) { 499 if err := reg.tree.Delete(r, false /* fast */); err != nil { 500 panic(err) 501 } 502 } 503 504 // Disconnect disconnects all registrations that overlap the specified span with 505 // a nil error. 506 func (reg *registry) Disconnect(span roachpb.Span) { 507 reg.DisconnectWithErr(span, nil /* pErr */) 508 } 509 510 // DisconnectWithErr disconnects all registrations that overlap the specified 511 // span with the provided error. 512 func (reg *registry) DisconnectWithErr(span roachpb.Span, pErr *roachpb.Error) { 513 reg.forOverlappingRegs(span, func(_ *registration) (bool, *roachpb.Error) { 514 return true, pErr 515 }) 516 } 517 518 // all is a span that overlaps with all registrations. 519 var all = roachpb.Span{Key: roachpb.KeyMin, EndKey: roachpb.KeyMax} 520 521 // forOverlappingRegs calls the provided function on each registration that 522 // overlaps the span. If the function returns true for a given registration 523 // then that registration is unregistered and the error returned by the 524 // function is send on its corresponding error channel. 525 func (reg *registry) forOverlappingRegs( 526 span roachpb.Span, fn func(*registration) (disconnect bool, pErr *roachpb.Error), 527 ) { 528 var toDelete []interval.Interface 529 matchFn := func(i interval.Interface) (done bool) { 530 r := i.(*registration) 531 dis, pErr := fn(r) 532 if dis { 533 r.disconnect(pErr) 534 toDelete = append(toDelete, i) 535 } 536 return false 537 } 538 if span.EqualValue(all) { 539 reg.tree.Do(matchFn) 540 } else { 541 reg.tree.DoMatching(matchFn, span.AsRange()) 542 } 543 544 if len(toDelete) == reg.tree.Len() { 545 reg.tree.Clear() 546 } else if len(toDelete) == 1 { 547 if err := reg.tree.Delete(toDelete[0], false /* fast */); err != nil { 548 panic(err) 549 } 550 } else if len(toDelete) > 1 { 551 for _, i := range toDelete { 552 if err := reg.tree.Delete(i, true /* fast */); err != nil { 553 panic(err) 554 } 555 } 556 reg.tree.AdjustRanges() 557 } 558 } 559 560 // Wait for this registration to completely process its internal buffer. 561 func (r *registration) waitForCaughtUp() error { 562 opts := retry.Options{ 563 InitialBackoff: 5 * time.Millisecond, 564 Multiplier: 2, 565 MaxBackoff: 10 * time.Second, 566 MaxRetries: 50, 567 } 568 for re := retry.Start(opts); re.Next(); { 569 r.mu.Lock() 570 caughtUp := len(r.buf) == 0 && r.mu.caughtUp 571 r.mu.Unlock() 572 if caughtUp { 573 return nil 574 } 575 } 576 return errors.Errorf("registration %v failed to empty in time", r.Range()) 577 } 578 579 // waitForCaughtUp waits for all registrations overlapping the given span to 580 // completely process their internal buffers. 581 func (reg *registry) waitForCaughtUp(span roachpb.Span) error { 582 var outerErr error 583 reg.forOverlappingRegs(span, func(r *registration) (bool, *roachpb.Error) { 584 if outerErr == nil { 585 outerErr = r.waitForCaughtUp() 586 } 587 return false, nil 588 }) 589 return outerErr 590 }