github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_snapshot.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "io" 17 "time" 18 19 "github.com/cockroachdb/cockroach/pkg/base" 20 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer" 21 "github.com/cockroachdb/cockroach/pkg/roachpb" 22 "github.com/cockroachdb/cockroach/pkg/settings" 23 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 24 "github.com/cockroachdb/cockroach/pkg/storage" 25 "github.com/cockroachdb/cockroach/pkg/util/envutil" 26 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 29 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 30 "github.com/cockroachdb/cockroach/pkg/util/uuid" 31 "github.com/cockroachdb/errors" 32 crdberrors "github.com/cockroachdb/errors" 33 "go.etcd.io/etcd/raft/raftpb" 34 "golang.org/x/time/rate" 35 ) 36 37 const ( 38 // Messages that provide detail about why a snapshot was rejected. 39 snapshotStoreTooFullMsg = "store almost out of disk space" 40 snapshotApplySemBusyMsg = "store busy applying snapshots" 41 storeDrainingMsg = "store is draining" 42 43 // IntersectingSnapshotMsg is part of the error message returned from 44 // canApplySnapshotLocked and is exposed here so testing can rely on it. 45 IntersectingSnapshotMsg = "snapshot intersects existing range" 46 ) 47 48 // incomingSnapshotStream is the minimal interface on a GRPC stream required 49 // to receive a snapshot over the network. 50 type incomingSnapshotStream interface { 51 Send(*SnapshotResponse) error 52 Recv() (*SnapshotRequest, error) 53 } 54 55 // outgoingSnapshotStream is the minimal interface on a GRPC stream required 56 // to send a snapshot over the network. 57 type outgoingSnapshotStream interface { 58 Send(*SnapshotRequest) error 59 Recv() (*SnapshotResponse, error) 60 } 61 62 // snapshotStrategy is an approach to sending and receiving Range snapshots. 63 // Each implementation corresponds to a SnapshotRequest_Strategy, and it is 64 // expected that the implementation that matches the Strategy specified in the 65 // snapshot header will always be used. 66 type snapshotStrategy interface { 67 // Receive streams SnapshotRequests in from the provided stream and 68 // constructs an IncomingSnapshot. 69 Receive(context.Context, incomingSnapshotStream, SnapshotRequest_Header) (IncomingSnapshot, error) 70 71 // Send streams SnapshotRequests created from the OutgoingSnapshot in to the 72 // provided stream. On nil error, the number of bytes sent is returned. 73 Send(context.Context, outgoingSnapshotStream, SnapshotRequest_Header, *OutgoingSnapshot) (int64, error) 74 75 // Status provides a status report on the work performed during the 76 // snapshot. Only valid if the strategy succeeded. 77 Status() string 78 79 // Close cleans up any resources associated with the snapshot strategy. 
80 Close(context.Context) 81 } 82 83 func assertStrategy( 84 ctx context.Context, header SnapshotRequest_Header, expect SnapshotRequest_Strategy, 85 ) { 86 if header.Strategy != expect { 87 log.Fatalf(ctx, "expected strategy %s, found strategy %s", expect, header.Strategy) 88 } 89 } 90 91 // kvBatchSnapshotStrategy is an implementation of snapshotStrategy that streams 92 // batches of KV pairs in the BatchRepr format. 93 type kvBatchSnapshotStrategy struct { 94 raftCfg *base.RaftConfig 95 status string 96 97 // The size of the batches of PUT operations to send to the receiver of the 98 // snapshot. Only used on the sender side. 99 batchSize int64 100 // Limiter for sending KV batches. Only used on the sender side. 101 limiter *rate.Limiter 102 // Only used on the sender side. 103 newBatch func() storage.Batch 104 // bytesSent is updated in sendBatch and returned from Send(). It does not 105 // reflect the log entries sent (which are never sent in newer versions of 106 // CRDB, as of VersionUnreplicatedTruncatedState). 107 bytesSent int64 108 109 // The approximate size of the SST chunk to buffer in memory on the receiver 110 // before flushing to disk. Only used on the receiver side. 111 sstChunkSize int64 112 // Only used on the receiver side. 113 scratch *SSTSnapshotStorageScratch 114 } 115 116 // multiSSTWriter is a wrapper around RocksDBSstFileWriter and 117 // SSTSnapshotStorageScratch that handles chunking SSTs and persisting them to 118 // disk. 119 type multiSSTWriter struct { 120 scratch *SSTSnapshotStorageScratch 121 currSST storage.SSTWriter 122 keyRanges []rditer.KeyRange 123 currRange int 124 // The approximate size of the SST chunk to buffer in memory on the receiver 125 // before flushing to disk. 126 sstChunkSize int64 127 } 128 129 func newMultiSSTWriter( 130 ctx context.Context, 131 scratch *SSTSnapshotStorageScratch, 132 keyRanges []rditer.KeyRange, 133 sstChunkSize int64, 134 ) (multiSSTWriter, error) { 135 msstw := multiSSTWriter{ 136 scratch: scratch, 137 keyRanges: keyRanges, 138 sstChunkSize: sstChunkSize, 139 } 140 if err := msstw.initSST(ctx); err != nil { 141 return msstw, err 142 } 143 return msstw, nil 144 } 145 146 func (msstw *multiSSTWriter) initSST(ctx context.Context) error { 147 newSSTFile, err := msstw.scratch.NewFile(ctx, msstw.sstChunkSize) 148 if err != nil { 149 return errors.Wrap(err, "failed to create new sst file") 150 } 151 newSST := storage.MakeIngestionSSTWriter(newSSTFile) 152 msstw.currSST = newSST 153 if err := msstw.currSST.ClearRange(msstw.keyRanges[msstw.currRange].Start, msstw.keyRanges[msstw.currRange].End); err != nil { 154 msstw.currSST.Close() 155 return errors.Wrap(err, "failed to clear range on sst file writer") 156 } 157 return nil 158 } 159 160 func (msstw *multiSSTWriter) finalizeSST(ctx context.Context) error { 161 err := msstw.currSST.Finish() 162 if err != nil { 163 return errors.Wrap(err, "failed to finish sst") 164 } 165 msstw.currRange++ 166 msstw.currSST.Close() 167 return nil 168 } 169 170 func (msstw *multiSSTWriter) Put(ctx context.Context, key storage.MVCCKey, value []byte) error { 171 for msstw.keyRanges[msstw.currRange].End.Key.Compare(key.Key) <= 0 { 172 // Finish the current SST, write to the file, and move to the next key 173 // range. 
174 if err := msstw.finalizeSST(ctx); err != nil { 175 return err 176 } 177 if err := msstw.initSST(ctx); err != nil { 178 return err 179 } 180 } 181 if msstw.keyRanges[msstw.currRange].Start.Key.Compare(key.Key) > 0 { 182 return crdberrors.AssertionFailedf("client error: expected %s to fall in one of %s", key.Key, msstw.keyRanges) 183 } 184 if err := msstw.currSST.Put(key, value); err != nil { 185 return errors.Wrap(err, "failed to put in sst") 186 } 187 return nil 188 } 189 190 func (msstw *multiSSTWriter) Finish(ctx context.Context) error { 191 if msstw.currRange < len(msstw.keyRanges) { 192 for { 193 if err := msstw.finalizeSST(ctx); err != nil { 194 return err 195 } 196 if msstw.currRange >= len(msstw.keyRanges) { 197 break 198 } 199 if err := msstw.initSST(ctx); err != nil { 200 return err 201 } 202 } 203 } 204 return nil 205 } 206 207 func (msstw *multiSSTWriter) Close() { 208 msstw.currSST.Close() 209 } 210 211 // Receive implements the snapshotStrategy interface. 212 // 213 // NOTE: This function assumes that the key-value pairs are sent in sorted 214 // order. The key-value pairs are sent in the following sorted order: 215 // 216 // 1. Replicated range-id local key range 217 // 2. Range-local key range 218 // 3. User key range 219 func (kvSS *kvBatchSnapshotStrategy) Receive( 220 ctx context.Context, stream incomingSnapshotStream, header SnapshotRequest_Header, 221 ) (IncomingSnapshot, error) { 222 assertStrategy(ctx, header, SnapshotRequest_KV_BATCH) 223 224 // At the moment we'll write at most three SSTs. 225 // TODO(jeffreyxiao): Re-evaluate as the default range size grows. 226 keyRanges := rditer.MakeReplicatedKeyRanges(header.State.Desc) 227 msstw, err := newMultiSSTWriter(ctx, kvSS.scratch, keyRanges, kvSS.sstChunkSize) 228 if err != nil { 229 return noSnap, err 230 } 231 defer msstw.Close() 232 var logEntries [][]byte 233 234 for { 235 req, err := stream.Recv() 236 if err != nil { 237 return noSnap, err 238 } 239 if req.Header != nil { 240 err := errors.New("client error: provided a header mid-stream") 241 return noSnap, sendSnapshotError(stream, err) 242 } 243 244 if req.KVBatch != nil { 245 batchReader, err := storage.NewRocksDBBatchReader(req.KVBatch) 246 if err != nil { 247 return noSnap, errors.Wrap(err, "failed to decode batch") 248 } 249 // All operations in the batch are guaranteed to be puts. 250 for batchReader.Next() { 251 if batchReader.BatchType() != storage.BatchTypeValue { 252 return noSnap, crdberrors.AssertionFailedf("expected type %d, found type %d", storage.BatchTypeValue, batchReader.BatchType()) 253 } 254 key, err := batchReader.MVCCKey() 255 if err != nil { 256 return noSnap, errors.Wrap(err, "failed to decode mvcc key") 257 } 258 if err := msstw.Put(ctx, key, batchReader.Value()); err != nil { 259 return noSnap, err 260 } 261 } 262 } 263 if req.LogEntries != nil { 264 logEntries = append(logEntries, req.LogEntries...) 265 } 266 if req.Final { 267 // We finished receiving all batches and log entries. It's possible that 268 // we did not receive any key-value pairs for some of the key ranges, but 269 // we must still construct SSTs with range deletion tombstones to remove 270 // the data. 
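			// Finish (below) takes care of that: it walks the key ranges that were
			// never reached and emits, for each, an SST containing only the
			// ClearRange tombstone laid down by initSST.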
271 if err := msstw.Finish(ctx); err != nil { 272 return noSnap, err 273 } 274 275 msstw.Close() 276 277 snapUUID, err := uuid.FromBytes(header.RaftMessageRequest.Message.Snapshot.Data) 278 if err != nil { 279 err = errors.Wrap(err, "client error: invalid snapshot") 280 return noSnap, sendSnapshotError(stream, err) 281 } 282 283 inSnap := IncomingSnapshot{ 284 UsesUnreplicatedTruncatedState: header.UnreplicatedTruncatedState, 285 SnapUUID: snapUUID, 286 SSTStorageScratch: kvSS.scratch, 287 LogEntries: logEntries, 288 State: &header.State, 289 snapType: header.Type, 290 } 291 292 expLen := inSnap.State.RaftAppliedIndex - inSnap.State.TruncatedState.Index 293 if expLen != uint64(len(logEntries)) { 294 // We've received a botched snapshot. We could fatal right here but opt 295 // to warn loudly instead, and fatal when applying the snapshot 296 // (in Replica.applySnapshot) in order to capture replica hard state. 297 log.Warningf(ctx, 298 "missing log entries in snapshot (%s): got %d entries, expected %d", 299 inSnap.String(), len(logEntries), expLen) 300 } 301 302 kvSS.status = fmt.Sprintf("log entries: %d, ssts: %d", len(logEntries), len(kvSS.scratch.SSTs())) 303 return inSnap, nil 304 } 305 } 306 } 307 308 // errMalformedSnapshot indicates that the snapshot in question is malformed, 309 // for e.g. missing raft log entries. 310 var errMalformedSnapshot = errors.New("malformed snapshot generated") 311 312 // Send implements the snapshotStrategy interface. 313 func (kvSS *kvBatchSnapshotStrategy) Send( 314 ctx context.Context, 315 stream outgoingSnapshotStream, 316 header SnapshotRequest_Header, 317 snap *OutgoingSnapshot, 318 ) (int64, error) { 319 assertStrategy(ctx, header, SnapshotRequest_KV_BATCH) 320 321 // Iterate over all keys using the provided iterator and stream out batches 322 // of key-values. 323 n := 0 324 var b storage.Batch 325 for iter := snap.Iter; ; iter.Next() { 326 if ok, err := iter.Valid(); err != nil { 327 return 0, err 328 } else if !ok { 329 break 330 } 331 key := iter.Key() 332 value := iter.Value() 333 n++ 334 if b == nil { 335 b = kvSS.newBatch() 336 } 337 if err := b.Put(key, value); err != nil { 338 b.Close() 339 return 0, err 340 } 341 342 if int64(b.Len()) >= kvSS.batchSize { 343 if err := kvSS.sendBatch(ctx, stream, b); err != nil { 344 return 0, err 345 } 346 b = nil 347 // We no longer need the keys and values in the batch we just sent, 348 // so reset ReplicaDataIterator's allocator and allow its data to 349 // be garbage collected. 350 iter.ResetAllocator() 351 } 352 } 353 if b != nil { 354 if err := kvSS.sendBatch(ctx, stream, b); err != nil { 355 return 0, err 356 } 357 } 358 359 // Iterate over the specified range of Raft entries and send them all out 360 // together. 361 firstIndex := header.State.TruncatedState.Index + 1 362 endIndex := snap.RaftSnap.Metadata.Index + 1 363 preallocSize := endIndex - firstIndex 364 const maxPreallocSize = 1000 365 if preallocSize > maxPreallocSize { 366 // It's possible for the raft log to become enormous in certain 367 // sustained failure conditions. We may bail out of the snapshot 368 // process early in scanFunc, but in the worst case this 369 // preallocation is enough to run the server out of memory. Limit 370 // the size of the buffer we will preallocate. 
371 preallocSize = maxPreallocSize 372 } 373 logEntries := make([][]byte, 0, preallocSize) 374 375 var raftLogBytes int64 376 scanFunc := func(kv roachpb.KeyValue) (bool, error) { 377 bytes, err := kv.Value.GetBytes() 378 if err == nil { 379 logEntries = append(logEntries, bytes) 380 raftLogBytes += int64(len(bytes)) 381 } 382 return false, err 383 } 384 385 rangeID := header.State.Desc.RangeID 386 387 if err := iterateEntries(ctx, snap.EngineSnap, rangeID, firstIndex, endIndex, scanFunc); err != nil { 388 return 0, err 389 } 390 391 // The difference between the snapshot index (applied index at the time of 392 // snapshot) and the truncated index should equal the number of log entries 393 // shipped over. 394 expLen := endIndex - firstIndex 395 if expLen != uint64(len(logEntries)) { 396 // We've generated a botched snapshot. We could fatal right here but opt 397 // to warn loudly instead, and fatal at the caller to capture a checkpoint 398 // of the underlying storage engine. 399 entriesRange, err := extractRangeFromEntries(logEntries) 400 if err != nil { 401 return 0, err 402 } 403 log.Warningf(ctx, "missing log entries in snapshot (%s): "+ 404 "got %d entries, expected %d (TruncatedState.Index=%d, LogEntries=%s)", 405 snap.String(), len(logEntries), expLen, snap.State.TruncatedState.Index, entriesRange) 406 return 0, errMalformedSnapshot 407 } 408 409 // Inline the payloads for all sideloaded proposals. 410 // 411 // TODO(tschottdorf): could also send slim proposals and attach sideloaded 412 // SSTables directly to the snapshot. Probably the better long-term 413 // solution, but let's see if it ever becomes relevant. Snapshots with 414 // inlined proposals are hopefully the exception. 415 { 416 var ent raftpb.Entry 417 for i := range logEntries { 418 if err := protoutil.Unmarshal(logEntries[i], &ent); err != nil { 419 return 0, err 420 } 421 if !sniffSideloadedRaftCommand(ent.Data) { 422 continue 423 } 424 if err := snap.WithSideloaded(func(ss SideloadStorage) error { 425 newEnt, err := maybeInlineSideloadedRaftCommand( 426 ctx, rangeID, ent, ss, snap.RaftEntryCache, 427 ) 428 if err != nil { 429 return err 430 } 431 if newEnt != nil { 432 ent = *newEnt 433 } 434 return nil 435 }); err != nil { 436 if errors.Is(err, errSideloadedFileNotFound) { 437 // We're creating the Raft snapshot based on a snapshot of 438 // the engine, but the Raft log may since have been 439 // truncated and corresponding on-disk sideloaded payloads 440 // unlinked. Luckily, we can just abort this snapshot; the 441 // caller can retry. 442 // 443 // TODO(tschottdorf): check how callers handle this. They 444 // should simply retry. In some scenarios, perhaps this can 445 // happen repeatedly and prevent a snapshot; not sending the 446 // log entries wouldn't help, though, and so we'd really 447 // need to make sure the entries are always here, for 448 // instance by pre-loading them into memory. Or we can make 449 // log truncation less aggressive about removing sideloaded 450 // files, by delaying trailing file deletion for a bit. 451 return 0, &errMustRetrySnapshotDueToTruncation{ 452 index: ent.Index, 453 term: ent.Term, 454 } 455 } 456 return 0, err 457 } 458 // TODO(tschottdorf): it should be possible to reuse `logEntries[i]` here. 
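			// Re-marshal the entry so that any payload inlined above replaces the
			// original bytes in logEntries.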
459 			var err error
460 			if logEntries[i], err = protoutil.Marshal(&ent); err != nil {
461 				return 0, err
462 			}
463 		}
464 	}
465 	kvSS.status = fmt.Sprintf("kv pairs: %d, log entries: %d", n, len(logEntries))
466 	if err := stream.Send(&SnapshotRequest{LogEntries: logEntries}); err != nil {
467 		return 0, err
468 	}
469 	return kvSS.bytesSent, nil
470 }
471 
472 func (kvSS *kvBatchSnapshotStrategy) sendBatch(
473 	ctx context.Context, stream outgoingSnapshotStream, batch storage.Batch,
474 ) error {
475 	if err := kvSS.limiter.WaitN(ctx, 1); err != nil {
476 		return err
477 	}
478 	repr := batch.Repr()
479 	kvSS.bytesSent += int64(len(repr))
480 	batch.Close()
481 	return stream.Send(&SnapshotRequest{KVBatch: repr})
482 }
483 
484 // Status implements the snapshotStrategy interface.
485 func (kvSS *kvBatchSnapshotStrategy) Status() string { return kvSS.status }
486 
487 // Close implements the snapshotStrategy interface.
488 func (kvSS *kvBatchSnapshotStrategy) Close(ctx context.Context) {
489 	if kvSS.scratch != nil {
490 		// A failure to clean up the storage is benign except that it will leak
491 		// disk space (which is reclaimed on node restart). It is unexpected
492 		// though, so log a warning.
493 		if err := kvSS.scratch.Clear(); err != nil {
494 			log.Warningf(ctx, "error closing kvBatchSnapshotStrategy: %v", err)
495 		}
496 	}
497 }
498 
499 // reserveSnapshot throttles incoming snapshots. The returned closure is used
500 // to clean up the reservation and release its resources. A nil cleanup function
501 // and a non-empty rejectionMessage indicate that the reservation was declined.
502 func (s *Store) reserveSnapshot(
503 	ctx context.Context, header *SnapshotRequest_Header,
504 ) (_cleanup func(), _rejectionMsg string, _err error) {
505 	tBegin := timeutil.Now()
506 	if header.RangeSize == 0 {
507 		// Empty snapshots are exempt from rate limits because they're so cheap to
508 		// apply. This vastly speeds up rebalancing any empty ranges created by a
509 		// RESTORE or manual SPLIT AT, since it prevents these empty snapshots from
510 		// getting stuck behind large snapshots managed by the replicate queue.
511 	} else if header.CanDecline {
512 		storeDesc, ok := s.cfg.StorePool.getStoreDescriptor(s.StoreID())
513 		if ok && (!maxCapacityCheck(storeDesc) || header.RangeSize > storeDesc.Capacity.Available) {
514 			return nil, snapshotStoreTooFullMsg, nil
515 		}
516 		select {
517 		case s.snapshotApplySem <- struct{}{}:
518 		case <-ctx.Done():
519 			return nil, "", ctx.Err()
520 		case <-s.stopper.ShouldStop():
521 			return nil, "", errors.Errorf("stopped")
522 		default:
523 			return nil, snapshotApplySemBusyMsg, nil
524 		}
525 	} else {
526 		select {
527 		case s.snapshotApplySem <- struct{}{}:
528 		case <-ctx.Done():
529 			return nil, "", ctx.Err()
530 		case <-s.stopper.ShouldStop():
531 			return nil, "", errors.Errorf("stopped")
532 		}
533 	}
534 
535 	// The choice here is essentially arbitrary, but with a default range size of 64mb and the
536 	// Raft snapshot rate limiting of 8mb/s, we expect to spend less than 8s per snapshot.
537 	// Preemptive snapshots are limited to 2mb/s (by default), so they can take up to 4x longer,
538 	// but an average range is closer to 32mb, so we expect ~16s for larger preemptive snapshots,
539 	// which is what we want to log.
540 const snapshotReservationWaitWarnThreshold = 13 * time.Second 541 if elapsed := timeutil.Since(tBegin); elapsed > snapshotReservationWaitWarnThreshold { 542 replDesc, _ := header.State.Desc.GetReplicaDescriptor(s.StoreID()) 543 log.Infof( 544 ctx, 545 "waited for %.1fs to acquire snapshot reservation to r%d/%d", 546 elapsed.Seconds(), 547 header.State.Desc.RangeID, 548 replDesc.ReplicaID, 549 ) 550 } 551 552 s.metrics.ReservedReplicaCount.Inc(1) 553 s.metrics.Reserved.Inc(header.RangeSize) 554 return func() { 555 s.metrics.ReservedReplicaCount.Dec(1) 556 s.metrics.Reserved.Dec(header.RangeSize) 557 if header.RangeSize != 0 { 558 <-s.snapshotApplySem 559 } 560 }, "", nil 561 } 562 563 // canApplySnapshotLocked returns (_, nil) if the snapshot can be applied to 564 // this store's replica (i.e. the snapshot is not from an older incarnation of 565 // the replica) and a placeholder can be added to the replicasByKey map (if 566 // necessary). If a placeholder is required, it is returned as the first value. 567 // 568 // Both the store mu (and the raft mu for an existing replica if there is one) 569 // must be held. 570 func (s *Store) canApplySnapshotLocked( 571 ctx context.Context, snapHeader *SnapshotRequest_Header, 572 ) (*ReplicaPlaceholder, error) { 573 if snapHeader.IsPreemptive() { 574 return nil, crdberrors.AssertionFailedf(`expected a raft or learner snapshot`) 575 } 576 577 // TODO(tbg): see the comment on desc.Generation for what seems to be a much 578 // saner way to handle overlap via generational semantics. 579 desc := *snapHeader.State.Desc 580 581 // First, check for an existing Replica. 582 v, ok := s.mu.replicas.Load( 583 int64(desc.RangeID), 584 ) 585 if !ok { 586 return nil, errors.Errorf("canApplySnapshotLocked requires a replica present") 587 } 588 existingRepl := (*Replica)(v) 589 // The raftMu is held which allows us to use the existing replica as a 590 // placeholder when we decide that the snapshot can be applied. As long 591 // as the caller releases the raftMu only after feeding the snapshot 592 // into the replica, this is safe. 593 existingRepl.raftMu.AssertHeld() 594 595 existingRepl.mu.RLock() 596 existingDesc := existingRepl.mu.state.Desc 597 existingIsInitialized := existingDesc.IsInitialized() 598 existingDestroyStatus := existingRepl.mu.destroyStatus 599 existingRepl.mu.RUnlock() 600 601 if existingIsInitialized { 602 // Regular Raft snapshots can't be refused at this point, 603 // even if they widen the existing replica. See the comments 604 // in Replica.maybeAcquireSnapshotMergeLock for how this is 605 // made safe. 606 // 607 // NB: The snapshot must be intended for this replica as 608 // withReplicaForRequest ensures that requests with a non-zero replica 609 // id are passed to a replica with a matching id. Given this is not a 610 // preemptive snapshot we know that its id must be non-zero. 611 return nil, nil 612 } 613 614 // If we are not alive then we should not apply a snapshot as our removal 615 // is imminent. 616 if existingDestroyStatus.Removed() { 617 return nil, existingDestroyStatus.err 618 } 619 620 // We have a key range [desc.StartKey,desc.EndKey) which we want to apply a 621 // snapshot for. Is there a conflicting existing placeholder or an 622 // overlapping range? 
623 	if err := s.checkSnapshotOverlapLocked(ctx, snapHeader); err != nil {
624 		return nil, err
625 	}
626 
627 	placeholder := &ReplicaPlaceholder{
628 		rangeDesc: desc,
629 	}
630 	return placeholder, nil
631 }
632 
633 // checkSnapshotOverlapLocked returns an error if the snapshot overlaps an
634 // existing replica or placeholder. Any replicas that do overlap have a good
635 // chance of being abandoned, so they're proactively handed to the GC queue.
636 func (s *Store) checkSnapshotOverlapLocked(
637 	ctx context.Context, snapHeader *SnapshotRequest_Header,
638 ) error {
639 	desc := *snapHeader.State.Desc
640 
641 	// NB: this check seems redundant since placeholders are also represented in
642 	// replicasByKey (and thus returned in getOverlappingKeyRangeLocked).
643 	if exRng, ok := s.mu.replicaPlaceholders[desc.RangeID]; ok {
644 		return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s %v", s, exRng, snapHeader.RaftMessageRequest.FromReplica)
645 	}
646 
647 	// TODO(benesch): consider discovering and GC'ing *all* overlapping ranges,
648 	// not just the first one that getOverlappingKeyRangeLocked happens to return.
649 	if exRange := s.getOverlappingKeyRangeLocked(&desc); exRange != nil {
650 		// We have a conflicting range, so we must block the snapshot.
651 		// When such a conflict exists, it will be resolved by one range
652 		// either being split or garbage collected.
653 		exReplica, err := s.GetReplica(exRange.Desc().RangeID)
654 		msg := IntersectingSnapshotMsg
655 		if err != nil {
656 			log.Warningf(ctx, "unable to look up overlapping replica on %s: %v", exReplica, err)
657 		} else {
658 			inactive := func(r *Replica) bool {
659 				if r.RaftStatus() == nil {
660 					return true
661 				}
662 				// TODO(benesch): this check does not detect inactivity on replicas with
663 				// epoch-based leases. Since the validity of an epoch-based lease is
664 				// tied to the owning node's liveness, the lease can be valid well after
665 				// the leader of the range has cut off communication with this replica.
666 				// Expiration-based leases, by contrast, will expire quickly if the
667 				// leader of the range stops sending this replica heartbeats.
668 				lease, pendingLease := r.GetLease()
669 				now := s.Clock().Now()
670 				return !r.IsLeaseValid(lease, now) &&
671 					(pendingLease == (roachpb.Lease{}) || !r.IsLeaseValid(pendingLease, now))
672 			}
673 			// We unconditionally send this replica through the GC queue. It's
674 			// reasonably likely that the GC queue will do nothing because the replica
675 			// needs to split instead, but better to err on the side of queueing too
676 			// frequently. Blocking Raft snapshots for too long can wedge a cluster,
677 			// and if the replica does need to be GC'd, this might be the only code
678 			// path that notices in a timely fashion.
679 			//
680 			// We're careful to avoid starving out other replicas in the GC queue by
681 			// queueing at a low priority unless we can prove that the range is
682 			// inactive and thus unlikely to be about to process a split.
683 			gcPriority := replicaGCPriorityDefault
684 			if inactive(exReplica) {
685 				gcPriority = replicaGCPrioritySuspect
686 			}
687 
688 			msg += "; initiated GC:"
689 			s.replicaGCQueue.AddAsync(ctx, exReplica, gcPriority)
690 		}
691 		return errors.Errorf("%s %v (incoming %v)", msg, exReplica, snapHeader.State.Desc.RSpan()) // exReplica can be nil
692 	}
693 	return nil
694 }
695 
696 // shouldAcceptSnapshotData is an optimization to check whether we should even
697 // bother to read the data for an incoming snapshot.
If the snapshot overlaps an 698 // existing replica or placeholder, we'd error during application anyway, so do 699 // it before transferring all the data. This method is a guess and may have 700 // false positives. If the snapshot should be rejected, an error is returned 701 // with a description of why. Otherwise, nil means we should accept the 702 // snapshot. 703 func (s *Store) shouldAcceptSnapshotData( 704 ctx context.Context, snapHeader *SnapshotRequest_Header, 705 ) error { 706 if snapHeader.IsPreemptive() { 707 return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`) 708 } 709 pErr := s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest, 710 func(ctx context.Context, r *Replica) *roachpb.Error { 711 // If the current replica is not initialized then we should accept this 712 // snapshot if it doesn't overlap existing ranges. 713 if !r.IsInitialized() { 714 s.mu.Lock() 715 defer s.mu.Unlock() 716 return roachpb.NewError(s.checkSnapshotOverlapLocked(ctx, snapHeader)) 717 } 718 // If the current range is initialized then we need to accept this 719 // snapshot. 720 return nil 721 }) 722 return pErr.GoError() 723 } 724 725 // receiveSnapshot receives an incoming snapshot via a pre-opened GRPC stream. 726 func (s *Store) receiveSnapshot( 727 ctx context.Context, header *SnapshotRequest_Header, stream incomingSnapshotStream, 728 ) error { 729 if fn := s.cfg.TestingKnobs.ReceiveSnapshot; fn != nil { 730 if err := fn(header); err != nil { 731 return sendSnapshotError(stream, err) 732 } 733 } 734 735 if header.IsPreemptive() { 736 return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`) 737 } 738 739 // Defensive check that any snapshot contains this store in the descriptor. 740 storeID := s.StoreID() 741 if _, ok := header.State.Desc.GetReplicaDescriptor(storeID); !ok { 742 return crdberrors.AssertionFailedf( 743 `snapshot of type %s was sent to s%d which did not contain it as a replica: %s`, 744 header.Type, storeID, header.State.Desc.Replicas()) 745 } 746 747 cleanup, rejectionMsg, err := s.reserveSnapshot(ctx, header) 748 if err != nil { 749 return err 750 } 751 if cleanup == nil { 752 return stream.Send(&SnapshotResponse{ 753 Status: SnapshotResponse_DECLINED, 754 Message: rejectionMsg, 755 }) 756 } 757 defer cleanup() 758 759 // Check to see if the snapshot can be applied but don't attempt to add 760 // a placeholder here, because we're not holding the replica's raftMu. 761 // We'll perform this check again later after receiving the rest of the 762 // snapshot data - this is purely an optimization to prevent downloading 763 // a snapshot that we know we won't be able to apply. 764 if err := s.shouldAcceptSnapshotData(ctx, header); err != nil { 765 return sendSnapshotError(stream, 766 errors.Wrapf(err, "%s,r%d: cannot apply snapshot", s, header.State.Desc.RangeID), 767 ) 768 } 769 770 // Determine which snapshot strategy the sender is using to send this 771 // snapshot. If we don't know how to handle the specified strategy, return 772 // an error. 
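	// KV_BATCH is currently the only strategy handled here; its scratch space is
	// keyed by range ID and snapshot UUID so that Close can clear any SSTs left
	// behind if receiving fails partway through.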
773 	var ss snapshotStrategy
774 	switch header.Strategy {
775 	case SnapshotRequest_KV_BATCH:
776 		snapUUID, err := uuid.FromBytes(header.RaftMessageRequest.Message.Snapshot.Data)
777 		if err != nil {
778 			err = errors.Wrap(err, "invalid snapshot")
779 			return sendSnapshotError(stream, err)
780 		}
781 
782 		ss = &kvBatchSnapshotStrategy{
783 			raftCfg:      &s.cfg.RaftConfig,
784 			scratch:      s.sstSnapshotStorage.NewScratchSpace(header.State.Desc.RangeID, snapUUID),
785 			sstChunkSize: snapshotSSTWriteSyncRate.Get(&s.cfg.Settings.SV),
786 		}
787 		defer ss.Close(ctx)
788 	default:
789 		return sendSnapshotError(stream,
790 			errors.Errorf("%s,r%d: unknown snapshot strategy: %s",
791 				s, header.State.Desc.RangeID, header.Strategy),
792 		)
793 	}
794 
795 	if err := stream.Send(&SnapshotResponse{Status: SnapshotResponse_ACCEPTED}); err != nil {
796 		return err
797 	}
798 	if log.V(2) {
799 		log.Infof(ctx, "accepted snapshot reservation for r%d", header.State.Desc.RangeID)
800 	}
801 
802 	inSnap, err := ss.Receive(ctx, stream, *header)
803 	if err != nil {
804 		return err
805 	}
806 	if err := s.processRaftSnapshotRequest(ctx, header, inSnap); err != nil {
807 		return sendSnapshotError(stream, errors.Wrap(err.GoError(), "failed to apply snapshot"))
808 	}
809 
810 	return stream.Send(&SnapshotResponse{Status: SnapshotResponse_APPLIED})
811 }
812 
813 func sendSnapshotError(stream incomingSnapshotStream, err error) error {
814 	return stream.Send(&SnapshotResponse{
815 		Status:  SnapshotResponse_ERROR,
816 		Message: err.Error(),
817 	})
818 }
819 
820 // SnapshotStorePool narrows StorePool to make sendSnapshot easier to test.
821 type SnapshotStorePool interface {
822 	throttle(reason throttleReason, why string, toStoreID roachpb.StoreID)
823 }
824 
825 // validatePositive validates that a settings value is positive.
826 func validatePositive(v int64) error {
827 	if v <= 0 {
828 		return errors.Errorf("%d is not positive", v)
829 	}
830 	return nil
831 }
832 
833 // rebalanceSnapshotRate is the rate at which preemptive snapshots can be sent.
834 // This includes snapshots generated for upreplication or for rebalancing.
835 var rebalanceSnapshotRate = settings.RegisterPublicValidatedByteSizeSetting(
836 	"kv.snapshot_rebalance.max_rate",
837 	"the rate limit (bytes/sec) to use for rebalance and upreplication snapshots",
838 	envutil.EnvOrDefaultBytes("COCKROACH_PREEMPTIVE_SNAPSHOT_RATE", 8<<20),
839 	validatePositive,
840 )
841 
842 // recoverySnapshotRate is the rate at which Raft-initiated snapshots can be
843 // sent. Ideally, one would never see a Raft-initiated snapshot; we'd like all
844 // the snapshots to be preemptive. However, it has proved unfeasible to
845 // completely get rid of them.
846 // TODO(tbg): The existence of this rate, separate from rebalanceSnapshotRate,
847 // does not make a whole lot of sense.
848 var recoverySnapshotRate = settings.RegisterPublicValidatedByteSizeSetting(
849 	"kv.snapshot_recovery.max_rate",
850 	"the rate limit (bytes/sec) to use for recovery snapshots",
851 	envutil.EnvOrDefaultBytes("COCKROACH_RAFT_SNAPSHOT_RATE", 8<<20),
852 	validatePositive,
853 )
854 
855 // snapshotSSTWriteSyncRate is the size of chunks to write before fsync-ing.
856 // The default of 2 MiB was chosen to be in line with the behavior in bulk-io.
857 // See sstWriteSyncRate.
858 var snapshotSSTWriteSyncRate = settings.RegisterByteSizeSetting( 859 "kv.snapshot_sst.sync_size", 860 "threshold after which snapshot SST writes must fsync", 861 2<<20, /* 2 MiB */ 862 ) 863 864 func snapshotRateLimit( 865 st *cluster.Settings, priority SnapshotRequest_Priority, 866 ) (rate.Limit, error) { 867 switch priority { 868 case SnapshotRequest_RECOVERY: 869 return rate.Limit(recoverySnapshotRate.Get(&st.SV)), nil 870 case SnapshotRequest_REBALANCE: 871 return rate.Limit(rebalanceSnapshotRate.Get(&st.SV)), nil 872 default: 873 return 0, errors.Errorf("unknown snapshot priority: %s", priority) 874 } 875 } 876 877 type errMustRetrySnapshotDueToTruncation struct { 878 index, term uint64 879 } 880 881 func (e *errMustRetrySnapshotDueToTruncation) Error() string { 882 return fmt.Sprintf( 883 "log truncation during snapshot removed sideloaded SSTable at index %d, term %d", 884 e.index, e.term, 885 ) 886 } 887 888 // sendSnapshot sends an outgoing snapshot via a pre-opened GRPC stream. 889 func sendSnapshot( 890 ctx context.Context, 891 raftCfg *base.RaftConfig, 892 st *cluster.Settings, 893 stream outgoingSnapshotStream, 894 storePool SnapshotStorePool, 895 header SnapshotRequest_Header, 896 snap *OutgoingSnapshot, 897 newBatch func() storage.Batch, 898 sent func(), 899 ) error { 900 start := timeutil.Now() 901 to := header.RaftMessageRequest.ToReplica 902 if err := stream.Send(&SnapshotRequest{Header: &header}); err != nil { 903 return err 904 } 905 // Wait until we get a response from the server. The recipient may queue us 906 // (only a limited number of snapshots are allowed concurrently) or flat-out 907 // reject the snapshot. After the initial message exchange, we'll go and send 908 // the actual snapshot (if not rejected). 909 resp, err := stream.Recv() 910 if err != nil { 911 storePool.throttle(throttleFailed, err.Error(), to.StoreID) 912 return err 913 } 914 switch resp.Status { 915 case SnapshotResponse_DECLINED: 916 if header.CanDecline { 917 declinedMsg := "reservation rejected" 918 if len(resp.Message) > 0 { 919 declinedMsg = resp.Message 920 } 921 err := &benignError{errors.Errorf("%s: remote declined %s: %s", to, snap, declinedMsg)} 922 storePool.throttle(throttleDeclined, err.Error(), to.StoreID) 923 return err 924 } 925 err := errors.Errorf("%s: programming error: remote declined required %s: %s", 926 to, snap, resp.Message) 927 storePool.throttle(throttleFailed, err.Error(), to.StoreID) 928 return err 929 case SnapshotResponse_ERROR: 930 storePool.throttle(throttleFailed, resp.Message, to.StoreID) 931 return errors.Errorf("%s: remote couldn't accept %s with error: %s", 932 to, snap, resp.Message) 933 case SnapshotResponse_ACCEPTED: 934 // This is the response we're expecting. Continue with snapshot sending. 935 default: 936 err := errors.Errorf("%s: server sent an invalid status while negotiating %s: %s", 937 to, snap, resp.Status) 938 storePool.throttle(throttleFailed, err.Error(), to.StoreID) 939 return err 940 } 941 942 durQueued := timeutil.Since(start) 943 start = timeutil.Now() 944 945 // The size of batches to send. This is the granularity of rate limiting. 946 const batchSize = 256 << 10 // 256 KB 947 targetRate, err := snapshotRateLimit(st, header.Priority) 948 if err != nil { 949 return errors.Wrapf(err, "%s", to) 950 } 951 952 // Convert the bytes/sec rate limit to batches/sec. 953 // 954 // TODO(peter): Using bytes/sec for rate limiting seems more natural but has 955 // practical difficulties. 
We either need to use a very large burst size 956 // which seems to disable the rate limiting, or call WaitN in smaller than 957 // burst size chunks which caused excessive slowness in testing. Would be 958 // nice to figure this out, but the batches/sec rate limit works for now. 959 limiter := rate.NewLimiter(targetRate/batchSize, 1 /* burst size */) 960 961 // Create a snapshotStrategy based on the desired snapshot strategy. 962 var ss snapshotStrategy 963 switch header.Strategy { 964 case SnapshotRequest_KV_BATCH: 965 ss = &kvBatchSnapshotStrategy{ 966 raftCfg: raftCfg, 967 batchSize: batchSize, 968 limiter: limiter, 969 newBatch: newBatch, 970 } 971 default: 972 log.Fatalf(ctx, "unknown snapshot strategy: %s", header.Strategy) 973 } 974 975 numBytesSent, err := ss.Send(ctx, stream, header, snap) 976 if err != nil { 977 return err 978 } 979 durSent := timeutil.Since(start) 980 981 // Notify the sent callback before the final snapshot request is sent so that 982 // the snapshots generated metric gets incremented before the snapshot is 983 // applied. 984 sent() 985 if err := stream.Send(&SnapshotRequest{Final: true}); err != nil { 986 return err 987 } 988 log.Infof( 989 ctx, 990 "streamed %s to %s in %.2fs @ %s/s: %s, rate-limit: %s/s, queued: %.2fs", 991 snap, 992 to, 993 durSent.Seconds(), 994 humanizeutil.IBytes(int64(float64(numBytesSent)/durSent.Seconds())), 995 ss.Status(), 996 humanizeutil.IBytes(int64(targetRate)), 997 durQueued.Seconds(), 998 ) 999 1000 resp, err = stream.Recv() 1001 if err != nil { 1002 return errors.Wrapf(err, "%s: remote failed to apply snapshot", to) 1003 } 1004 // NB: wait for EOF which ensures that all processing on the server side has 1005 // completed (such as defers that might be run after the previous message was 1006 // received). 1007 if unexpectedResp, err := stream.Recv(); err != io.EOF { 1008 return errors.Errorf("%s: expected EOF, got resp=%v err=%v", to, unexpectedResp, err) 1009 } 1010 switch resp.Status { 1011 case SnapshotResponse_ERROR: 1012 return errors.Errorf("%s: remote failed to apply snapshot for reason %s", to, resp.Message) 1013 case SnapshotResponse_APPLIED: 1014 return nil 1015 default: 1016 return errors.Errorf("%s: server sent an invalid status during finalization: %s", 1017 to, resp.Status) 1018 } 1019 }
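The batches-per-second conversion in sendSnapshot is easy to misread, so here is a minimal, self-contained sketch of the same arithmetic using golang.org/x/time/rate. The 8 MB/s figure and the three-iteration loop are illustrative assumptions; only the NewLimiter and WaitN calls mirror what sendSnapshot and sendBatch do above.

package main

import (
	"context"
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// sendSnapshot rate limits in units of whole batches rather than bytes:
	// the configured byte rate is divided by the fixed 256 KB batch size, and
	// the limiter is created with a burst of 1 so batches are spaced evenly.
	const batchSize = 256 << 10       // 256 KB per SnapshotRequest batch
	targetRate := rate.Limit(8 << 20) // assume the 8 MB/s default setting
	limiter := rate.NewLimiter(targetRate/batchSize, 1 /* burst */)

	ctx := context.Background()
	for i := 0; i < 3; i++ {
		// One WaitN(ctx, 1) call per batch, mirroring sendBatch; at 8 MB/s and
		// 256 KB batches this works out to ~32 batches per second.
		if err := limiter.WaitN(ctx, 1); err != nil {
			fmt.Println("wait interrupted:", err)
			return
		}
		fmt.Printf("batch %d cleared the limiter\n", i)
	}
}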