// Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_raftstorage.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"math"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftentry"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
)

// replicaRaftStorage implements the raft.Storage interface.
//
// It is a defined type whose underlying type is Replica, so a *Replica can be
// converted to a *replicaRaftStorage (and back) without copying; the wrapper
// methods in this file rely on such conversions.
type replicaRaftStorage Replica

// Compile-time assertion that *replicaRaftStorage satisfies raft.Storage.
var _ raft.Storage = (*replicaRaftStorage)(nil)

// All calls to raft.RawNode require that both Replica.raftMu and
// Replica.mu are held.
All of the functions exposed via the 46 // raft.Storage interface will in turn be called from RawNode, so none 47 // of these methods may acquire either lock, but they may require 48 // their caller to hold one or both locks (even though they do not 49 // follow our "Locked" naming convention). Specific locking 50 // requirements are noted in each method's comments. 51 // 52 // Many of the methods defined in this file are wrappers around static 53 // functions. This is done to facilitate their use from 54 // Replica.Snapshot(), where it is important that all the data that 55 // goes into the snapshot comes from a consistent view of the 56 // database, and not the replica's in-memory state or via a reference 57 // to Replica.store.Engine(). 58 59 // InitialState implements the raft.Storage interface. 60 // InitialState requires that r.mu is held. 61 func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState, error) { 62 ctx := r.AnnotateCtx(context.TODO()) 63 hs, err := r.mu.stateLoader.LoadHardState(ctx, r.store.Engine()) 64 // For uninitialized ranges, membership is unknown at this point. 65 if raft.IsEmptyHardState(hs) || err != nil { 66 return raftpb.HardState{}, raftpb.ConfState{}, err 67 } 68 cs := r.mu.state.Desc.Replicas().ConfState() 69 return hs, cs, nil 70 } 71 72 // Entries implements the raft.Storage interface. Note that maxBytes is advisory 73 // and this method will always return at least one entry even if it exceeds 74 // maxBytes. Sideloaded proposals count towards maxBytes with their payloads inlined. 
75 func (r *replicaRaftStorage) Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) { 76 readonly := r.store.Engine().NewReadOnly() 77 defer readonly.Close() 78 ctx := r.AnnotateCtx(context.TODO()) 79 if r.raftMu.sideloaded == nil { 80 return nil, errors.New("sideloaded storage is uninitialized") 81 } 82 return entries(ctx, r.mu.stateLoader, readonly, r.RangeID, r.store.raftEntryCache, 83 r.raftMu.sideloaded, lo, hi, maxBytes) 84 } 85 86 // raftEntriesLocked requires that r.mu is held. 87 func (r *Replica) raftEntriesLocked(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) { 88 return (*replicaRaftStorage)(r).Entries(lo, hi, maxBytes) 89 } 90 91 // entries retrieves entries from the engine. To accommodate loading the term, 92 // `sideloaded` can be supplied as nil, in which case sideloaded entries will 93 // not be inlined, the raft entry cache will not be populated with *any* of the 94 // loaded entries, and maxBytes will not be applied to the payloads. 95 func entries( 96 ctx context.Context, 97 rsl stateloader.StateLoader, 98 reader storage.Reader, 99 rangeID roachpb.RangeID, 100 eCache *raftentry.Cache, 101 sideloaded SideloadStorage, 102 lo, hi, maxBytes uint64, 103 ) ([]raftpb.Entry, error) { 104 if lo > hi { 105 return nil, errors.Errorf("lo:%d is greater than hi:%d", lo, hi) 106 } 107 108 n := hi - lo 109 if n > 100 { 110 n = 100 111 } 112 ents := make([]raftpb.Entry, 0, n) 113 114 ents, size, hitIndex, exceededMaxBytes := eCache.Scan(ents, rangeID, lo, hi, maxBytes) 115 116 // Return results if the correct number of results came back or if 117 // we ran into the max bytes limit. 118 if uint64(len(ents)) == hi-lo || exceededMaxBytes { 119 return ents, nil 120 } 121 122 // Scan over the log to find the requested entries in the range [lo, hi), 123 // stopping once we have enough. 124 expectedIndex := hitIndex 125 126 // Whether we can populate the Raft entries cache. 
False if we found a 127 // sideloaded proposal, but the caller didn't give us a sideloaded storage. 128 canCache := true 129 130 var ent raftpb.Entry 131 scanFunc := func(kv roachpb.KeyValue) (bool, error) { 132 if err := kv.Value.GetProto(&ent); err != nil { 133 return false, err 134 } 135 // Exit early if we have any gaps or it has been compacted. 136 if ent.Index != expectedIndex { 137 return true, nil 138 } 139 expectedIndex++ 140 141 if sniffSideloadedRaftCommand(ent.Data) { 142 canCache = canCache && sideloaded != nil 143 if sideloaded != nil { 144 newEnt, err := maybeInlineSideloadedRaftCommand( 145 ctx, rangeID, ent, sideloaded, eCache, 146 ) 147 if err != nil { 148 return true, err 149 } 150 if newEnt != nil { 151 ent = *newEnt 152 } 153 } 154 } 155 156 // Note that we track the size of proposals with payloads inlined. 157 size += uint64(ent.Size()) 158 if size > maxBytes { 159 exceededMaxBytes = true 160 if len(ents) > 0 { 161 return exceededMaxBytes, nil 162 } 163 } 164 ents = append(ents, ent) 165 return exceededMaxBytes, nil 166 } 167 168 if err := iterateEntries(ctx, reader, rangeID, expectedIndex, hi, scanFunc); err != nil { 169 return nil, err 170 } 171 // Cache the fetched entries, if we may. 172 if canCache { 173 eCache.Add(rangeID, ents, false /* truncate */) 174 } 175 176 // Did the correct number of results come back? If so, we're all good. 177 if uint64(len(ents)) == hi-lo { 178 return ents, nil 179 } 180 181 // Did we hit the size limit? If so, return what we have. 182 if exceededMaxBytes { 183 return ents, nil 184 } 185 186 // Did we get any results at all? Because something went wrong. 187 if len(ents) > 0 { 188 // Was the lo already truncated? 189 if ents[0].Index > lo { 190 return nil, raft.ErrCompacted 191 } 192 193 // Was the missing index after the last index? 
194 lastIndex, err := rsl.LoadLastIndex(ctx, reader) 195 if err != nil { 196 return nil, err 197 } 198 if lastIndex <= expectedIndex { 199 return nil, raft.ErrUnavailable 200 } 201 202 // We have a gap in the record, if so, return a nasty error. 203 return nil, errors.Errorf("there is a gap in the index record between lo:%d and hi:%d at index:%d", lo, hi, expectedIndex) 204 } 205 206 // No results, was it due to unavailability or truncation? 207 ts, _, err := rsl.LoadRaftTruncatedState(ctx, reader) 208 if err != nil { 209 return nil, err 210 } 211 if ts.Index >= lo { 212 // The requested lo index has already been truncated. 213 return nil, raft.ErrCompacted 214 } 215 // The requested lo index does not yet exist. 216 return nil, raft.ErrUnavailable 217 } 218 219 func iterateEntries( 220 ctx context.Context, 221 reader storage.Reader, 222 rangeID roachpb.RangeID, 223 lo, hi uint64, 224 scanFunc func(roachpb.KeyValue) (bool, error), 225 ) error { 226 _, err := storage.MVCCIterate( 227 ctx, reader, 228 keys.RaftLogKey(rangeID, lo), 229 keys.RaftLogKey(rangeID, hi), 230 hlc.Timestamp{}, 231 storage.MVCCScanOptions{}, 232 scanFunc, 233 ) 234 return err 235 } 236 237 // invalidLastTerm is an out-of-band value for r.mu.lastTerm that 238 // invalidates lastTerm caching and forces retrieval of Term(lastTerm) 239 // from the raftEntryCache/RocksDB. 240 const invalidLastTerm = 0 241 242 // Term implements the raft.Storage interface. 243 func (r *replicaRaftStorage) Term(i uint64) (uint64, error) { 244 // TODO(nvanbenschoten): should we set r.mu.lastTerm when 245 // r.mu.lastIndex == i && r.mu.lastTerm == invalidLastTerm? 246 if r.mu.lastIndex == i && r.mu.lastTerm != invalidLastTerm { 247 return r.mu.lastTerm, nil 248 } 249 // Try to retrieve the term for the desired entry from the entry cache. 
250 if e, ok := r.store.raftEntryCache.Get(r.RangeID, i); ok { 251 return e.Term, nil 252 } 253 readonly := r.store.Engine().NewReadOnly() 254 defer readonly.Close() 255 ctx := r.AnnotateCtx(context.TODO()) 256 return term(ctx, r.mu.stateLoader, readonly, r.RangeID, r.store.raftEntryCache, i) 257 } 258 259 // raftTermLocked requires that r.mu is locked for reading. 260 func (r *Replica) raftTermRLocked(i uint64) (uint64, error) { 261 return (*replicaRaftStorage)(r).Term(i) 262 } 263 264 func term( 265 ctx context.Context, 266 rsl stateloader.StateLoader, 267 reader storage.Reader, 268 rangeID roachpb.RangeID, 269 eCache *raftentry.Cache, 270 i uint64, 271 ) (uint64, error) { 272 // entries() accepts a `nil` sideloaded storage and will skip inlining of 273 // sideloaded entries. We only need the term, so this is what we do. 274 ents, err := entries(ctx, rsl, reader, rangeID, eCache, nil /* sideloaded */, i, i+1, math.MaxUint64 /* maxBytes */) 275 if errors.Is(err, raft.ErrCompacted) { 276 ts, _, err := rsl.LoadRaftTruncatedState(ctx, reader) 277 if err != nil { 278 return 0, err 279 } 280 if i == ts.Index { 281 return ts.Term, nil 282 } 283 return 0, raft.ErrCompacted 284 } else if err != nil { 285 return 0, err 286 } 287 if len(ents) == 0 { 288 return 0, nil 289 } 290 return ents[0].Term, nil 291 } 292 293 // LastIndex implements the raft.Storage interface. 294 func (r *replicaRaftStorage) LastIndex() (uint64, error) { 295 return r.mu.lastIndex, nil 296 } 297 298 // raftLastIndexLocked requires that r.mu is held. 299 func (r *Replica) raftLastIndexLocked() (uint64, error) { 300 return (*replicaRaftStorage)(r).LastIndex() 301 } 302 303 // raftTruncatedStateLocked returns metadata about the log that preceded the 304 // first current entry. This includes both entries that have been compacted away 305 // and the dummy entries that make up the starting point of an empty log. 306 // raftTruncatedStateLocked requires that r.mu is held. 
307 func (r *Replica) raftTruncatedStateLocked( 308 ctx context.Context, 309 ) (roachpb.RaftTruncatedState, error) { 310 if r.mu.state.TruncatedState != nil { 311 return *r.mu.state.TruncatedState, nil 312 } 313 ts, _, err := r.mu.stateLoader.LoadRaftTruncatedState(ctx, r.store.Engine()) 314 if err != nil { 315 return ts, err 316 } 317 if ts.Index != 0 { 318 r.mu.state.TruncatedState = &ts 319 } 320 return ts, nil 321 } 322 323 // FirstIndex implements the raft.Storage interface. 324 func (r *replicaRaftStorage) FirstIndex() (uint64, error) { 325 ctx := r.AnnotateCtx(context.TODO()) 326 ts, err := (*Replica)(r).raftTruncatedStateLocked(ctx) 327 if err != nil { 328 return 0, err 329 } 330 return ts.Index + 1, nil 331 } 332 333 // raftFirstIndexLocked requires that r.mu is held. 334 func (r *Replica) raftFirstIndexLocked() (uint64, error) { 335 return (*replicaRaftStorage)(r).FirstIndex() 336 } 337 338 // GetFirstIndex is the same function as raftFirstIndexLocked but it requires 339 // that r.mu is not held. 340 func (r *Replica) GetFirstIndex() (uint64, error) { 341 r.mu.Lock() 342 defer r.mu.Unlock() 343 return r.raftFirstIndexLocked() 344 } 345 346 // GetLeaseAppliedIndex returns the lease index of the last applied command. 347 func (r *Replica) GetLeaseAppliedIndex() uint64 { 348 r.mu.RLock() 349 defer r.mu.RUnlock() 350 return r.mu.state.LeaseAppliedIndex 351 } 352 353 // Snapshot implements the raft.Storage interface. Snapshot requires that 354 // r.mu is held. Note that the returned snapshot is a placeholder and 355 // does not contain any of the replica data. The snapshot is actually generated 356 // (and sent) by the Raft snapshot queue. 
357 func (r *replicaRaftStorage) Snapshot() (raftpb.Snapshot, error) { 358 r.mu.AssertHeld() 359 appliedIndex := r.mu.state.RaftAppliedIndex 360 term, err := r.Term(appliedIndex) 361 if err != nil { 362 return raftpb.Snapshot{}, err 363 } 364 return raftpb.Snapshot{ 365 Metadata: raftpb.SnapshotMetadata{ 366 Index: appliedIndex, 367 Term: term, 368 }, 369 }, nil 370 } 371 372 // raftSnapshotLocked requires that r.mu is held. 373 func (r *Replica) raftSnapshotLocked() (raftpb.Snapshot, error) { 374 return (*replicaRaftStorage)(r).Snapshot() 375 } 376 377 // GetSnapshot returns a snapshot of the replica appropriate for sending to a 378 // replica. If this method returns without error, callers must eventually call 379 // OutgoingSnapshot.Close. 380 func (r *Replica) GetSnapshot( 381 ctx context.Context, snapType SnapshotRequest_Type, recipientStore roachpb.StoreID, 382 ) (_ *OutgoingSnapshot, err error) { 383 snapUUID := uuid.MakeV4() 384 // Get a snapshot while holding raftMu to make sure we're not seeing "half 385 // an AddSSTable" (i.e. a state in which an SSTable has been linked in, but 386 // the corresponding Raft command not applied yet). 387 r.raftMu.Lock() 388 snap := r.store.engine.NewSnapshot() 389 r.mu.Lock() 390 appliedIndex := r.mu.state.RaftAppliedIndex 391 // Cleared when OutgoingSnapshot closes. 
392 r.addSnapshotLogTruncationConstraintLocked(ctx, snapUUID, appliedIndex, recipientStore) 393 r.mu.Unlock() 394 r.raftMu.Unlock() 395 396 release := func() { 397 now := timeutil.Now() 398 r.completeSnapshotLogTruncationConstraint(ctx, snapUUID, now) 399 } 400 401 defer func() { 402 if err != nil { 403 release() 404 snap.Close() 405 } 406 }() 407 408 r.mu.RLock() 409 defer r.mu.RUnlock() 410 rangeID := r.RangeID 411 412 startKey := r.mu.state.Desc.StartKey 413 ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot") 414 defer sp.Finish() 415 416 log.Eventf(ctx, "new engine snapshot for replica %s", r) 417 418 // Delegate to a static function to make sure that we do not depend 419 // on any indirect calls to r.store.Engine() (or other in-memory 420 // state of the Replica). Everything must come from the snapshot. 421 withSideloaded := func(fn func(SideloadStorage) error) error { 422 r.raftMu.Lock() 423 defer r.raftMu.Unlock() 424 return fn(r.raftMu.sideloaded) 425 } 426 // NB: We have Replica.mu read-locked, but we need it write-locked in order 427 // to use Replica.mu.stateLoader. This call is not performance sensitive, so 428 // create a new state loader. 429 snapData, err := snapshot( 430 ctx, snapUUID, stateloader.Make(rangeID), snapType, 431 snap, rangeID, r.store.raftEntryCache, withSideloaded, startKey, 432 ) 433 if err != nil { 434 log.Errorf(ctx, "error generating snapshot: %+v", err) 435 return nil, err 436 } 437 snapData.onClose = release 438 return &snapData, nil 439 } 440 441 // OutgoingSnapshot contains the data required to stream a snapshot to a 442 // recipient. Once one is created, it needs to be closed via Close() to prevent 443 // resource leakage. 444 type OutgoingSnapshot struct { 445 SnapUUID uuid.UUID 446 // The Raft snapshot message to send. Contains SnapUUID as its data. 447 RaftSnap raftpb.Snapshot 448 // The RocksDB snapshot that will be streamed from. 449 EngineSnap storage.Reader 450 // The complete range iterator for the snapshot to stream. 
451 Iter *rditer.ReplicaDataIterator 452 // The replica state within the snapshot. 453 State kvserverpb.ReplicaState 454 // Allows access the the original Replica's sideloaded storage. Note that 455 // this isn't a snapshot of the sideloaded storage congruent with EngineSnap 456 // or RaftSnap -- a log truncation could have removed files from the 457 // sideloaded storage in the meantime. 458 WithSideloaded func(func(SideloadStorage) error) error 459 RaftEntryCache *raftentry.Cache 460 snapType SnapshotRequest_Type 461 onClose func() 462 } 463 464 func (s *OutgoingSnapshot) String() string { 465 return fmt.Sprintf("%s snapshot %s at applied index %d", s.snapType, s.SnapUUID.Short(), s.State.RaftAppliedIndex) 466 } 467 468 // Close releases the resources associated with the snapshot. 469 func (s *OutgoingSnapshot) Close() { 470 s.Iter.Close() 471 s.EngineSnap.Close() 472 if s.onClose != nil { 473 s.onClose() 474 } 475 } 476 477 // IncomingSnapshot contains the data for an incoming streaming snapshot message. 478 type IncomingSnapshot struct { 479 SnapUUID uuid.UUID 480 // The storage interface for the underlying SSTs. 481 SSTStorageScratch *SSTSnapshotStorageScratch 482 // The Raft log entries for this snapshot. 483 LogEntries [][]byte 484 // The replica state at the time the snapshot was generated (never nil). 485 State *kvserverpb.ReplicaState 486 // 487 // When true, this snapshot contains an unreplicated TruncatedState. When 488 // false, the TruncatedState is replicated (see the reference below) and the 489 // recipient must avoid also writing the unreplicated TruncatedState. The 490 // migration to an unreplicated TruncatedState will be carried out during 491 // the next log truncation (assuming cluster version is bumped at that 492 // point). 493 // See the comment on VersionUnreplicatedRaftTruncatedState for details. 
494 UsesUnreplicatedTruncatedState bool 495 snapType SnapshotRequest_Type 496 } 497 498 func (s *IncomingSnapshot) String() string { 499 return fmt.Sprintf("%s snapshot %s at applied index %d", s.snapType, s.SnapUUID.Short(), s.State.RaftAppliedIndex) 500 } 501 502 // snapshot creates an OutgoingSnapshot containing a rocksdb snapshot for the 503 // given range. Note that snapshot() is called without Replica.raftMu held. 504 func snapshot( 505 ctx context.Context, 506 snapUUID uuid.UUID, 507 rsl stateloader.StateLoader, 508 snapType SnapshotRequest_Type, 509 snap storage.Reader, 510 rangeID roachpb.RangeID, 511 eCache *raftentry.Cache, 512 withSideloaded func(func(SideloadStorage) error) error, 513 startKey roachpb.RKey, 514 ) (OutgoingSnapshot, error) { 515 var desc roachpb.RangeDescriptor 516 // We ignore intents on the range descriptor (consistent=false) because we 517 // know they cannot be committed yet; operations that modify range 518 // descriptors resolve their own intents when they commit. 519 ok, err := storage.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey), 520 hlc.MaxTimestamp, &desc, storage.MVCCGetOptions{Inconsistent: true}) 521 if err != nil { 522 return OutgoingSnapshot{}, errors.Errorf("failed to get desc: %s", err) 523 } 524 if !ok { 525 return OutgoingSnapshot{}, errors.Errorf("couldn't find range descriptor") 526 } 527 528 // Read the range metadata from the snapshot instead of the members 529 // of the Range struct because they might be changed concurrently. 
530 appliedIndex, _, err := rsl.LoadAppliedIndex(ctx, snap) 531 if err != nil { 532 return OutgoingSnapshot{}, err 533 } 534 535 term, err := term(ctx, rsl, snap, rangeID, eCache, appliedIndex) 536 if err != nil { 537 return OutgoingSnapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err) 538 } 539 540 state, err := rsl.Load(ctx, snap, &desc) 541 if err != nil { 542 return OutgoingSnapshot{}, err 543 } 544 545 // Intentionally let this iterator and the snapshot escape so that the 546 // streamer can send chunks from it bit by bit. 547 iter := rditer.NewReplicaDataIterator(&desc, snap, 548 true /* replicatedOnly */, false /* seekEnd */) 549 550 return OutgoingSnapshot{ 551 RaftEntryCache: eCache, 552 WithSideloaded: withSideloaded, 553 EngineSnap: snap, 554 Iter: iter, 555 State: state, 556 SnapUUID: snapUUID, 557 RaftSnap: raftpb.Snapshot{ 558 Data: snapUUID.GetBytes(), 559 Metadata: raftpb.SnapshotMetadata{ 560 Index: appliedIndex, 561 Term: term, 562 // Synthesize our raftpb.ConfState from desc. 563 ConfState: desc.Replicas().ConfState(), 564 }, 565 }, 566 snapType: snapType, 567 }, nil 568 } 569 570 // append the given entries to the raft log. Takes the previous values of 571 // r.mu.lastIndex, r.mu.lastTerm, and r.mu.raftLogSize, and returns new values. 572 // We do this rather than modifying them directly because these modifications 573 // need to be atomic with the commit of the batch. This method requires that 574 // r.raftMu is held. 575 // 576 // append is intentionally oblivious to the existence of sideloaded proposals. 577 // They are managed by the caller, including cleaning up obsolete on-disk 578 // payloads in case the log tail is replaced. 579 // 580 // NOTE: This method takes a engine.Writer because reads are unnecessary when 581 // prevLastIndex is 0 and prevLastTerm is invalidLastTerm. In the case where 582 // reading is necessary (I.E. 
entries are getting overwritten or deleted), a 583 // engine.ReadWriter must be passed in. 584 func (r *Replica) append( 585 ctx context.Context, 586 writer storage.Writer, 587 prevLastIndex uint64, 588 prevLastTerm uint64, 589 prevRaftLogSize int64, 590 entries []raftpb.Entry, 591 ) (uint64, uint64, int64, error) { 592 if len(entries) == 0 { 593 return prevLastIndex, prevLastTerm, prevRaftLogSize, nil 594 } 595 var diff enginepb.MVCCStats 596 var value roachpb.Value 597 for i := range entries { 598 ent := &entries[i] 599 key := r.raftMu.stateLoader.RaftLogKey(ent.Index) 600 601 if err := value.SetProto(ent); err != nil { 602 return 0, 0, 0, err 603 } 604 value.InitChecksum(key) 605 var err error 606 if ent.Index > prevLastIndex { 607 err = storage.MVCCBlindPut(ctx, writer, &diff, key, hlc.Timestamp{}, value, nil /* txn */) 608 } else { 609 // We type assert `writer` to also be an engine.ReadWriter only in 610 // the case where we're replacing existing entries. 611 eng, ok := writer.(storage.ReadWriter) 612 if !ok { 613 panic("expected writer to be a engine.ReadWriter when overwriting log entries") 614 } 615 err = storage.MVCCPut(ctx, eng, &diff, key, hlc.Timestamp{}, value, nil /* txn */) 616 } 617 if err != nil { 618 return 0, 0, 0, err 619 } 620 } 621 622 lastIndex := entries[len(entries)-1].Index 623 lastTerm := entries[len(entries)-1].Term 624 // Delete any previously appended log entries which never committed. 625 if prevLastIndex > 0 { 626 // We type assert `writer` to also be an engine.ReadWriter only in the 627 // case where we're deleting existing entries. 628 eng, ok := writer.(storage.ReadWriter) 629 if !ok { 630 panic("expected writer to be a engine.ReadWriter when deleting log entries") 631 } 632 for i := lastIndex + 1; i <= prevLastIndex; i++ { 633 // Note that the caller is in charge of deleting any sideloaded payloads 634 // (which they must only do *after* the batch has committed). 
635 err := storage.MVCCDelete(ctx, eng, &diff, r.raftMu.stateLoader.RaftLogKey(i), 636 hlc.Timestamp{}, nil /* txn */) 637 if err != nil { 638 return 0, 0, 0, err 639 } 640 } 641 } 642 643 raftLogSize := prevRaftLogSize + diff.SysBytes 644 return lastIndex, lastTerm, raftLogSize, nil 645 } 646 647 // updateRangeInfo is called whenever a range is updated by ApplySnapshot 648 // or is created by range splitting to setup the fields which are 649 // uninitialized or need updating. 650 func (r *Replica) updateRangeInfo(desc *roachpb.RangeDescriptor) error { 651 // RangeMaxBytes should be updated by looking up Zone Config in two cases: 652 // 1. After applying a snapshot, if the zone config was not updated for 653 // this key range, then maxBytes of this range will not be updated either. 654 // 2. After a new range is created by a split, only copying maxBytes from 655 // the original range wont work as the original and new ranges might belong 656 // to different zones. 657 // Load the system config. 658 cfg := r.store.Gossip().GetSystemConfig() 659 if cfg == nil { 660 // This could be before the system config was ever gossiped, 661 // or it expired. Let the gossip callback set the info. 662 ctx := r.AnnotateCtx(context.TODO()) 663 log.Warningf(ctx, "no system config available, cannot determine range MaxBytes") 664 return nil 665 } 666 667 // Find zone config for this range. 668 zone, err := cfg.GetZoneConfigForKey(desc.StartKey) 669 if err != nil { 670 return errors.Errorf("%s: failed to lookup zone config: %s", r, err) 671 } 672 673 r.SetZoneConfig(zone) 674 return nil 675 } 676 677 // clearRangeData clears the data associated with a range descriptor. If 678 // rangeIDLocalOnly is true, then only the range-id local keys are deleted. 679 // Otherwise, the range-id local keys, range local keys, and user keys are all 680 // deleted. If mustClearRange is true, ClearRange will always be used to remove 681 // the keys. 
Otherwise, ClearRangeWithHeuristic will be used, which chooses 682 // ClearRange or ClearIterRange depending on how many keys there are in the 683 // range. 684 func clearRangeData( 685 desc *roachpb.RangeDescriptor, 686 reader storage.Reader, 687 writer storage.Writer, 688 rangeIDLocalOnly bool, 689 mustClearRange bool, 690 ) error { 691 var keyRanges []rditer.KeyRange 692 if rangeIDLocalOnly { 693 keyRanges = []rditer.KeyRange{rditer.MakeRangeIDLocalKeyRange(desc.RangeID, false)} 694 } else { 695 keyRanges = rditer.MakeAllKeyRanges(desc) 696 } 697 var clearRangeFn func(storage.Reader, storage.Writer, roachpb.Key, roachpb.Key) error 698 if mustClearRange { 699 clearRangeFn = func(reader storage.Reader, writer storage.Writer, start, end roachpb.Key) error { 700 return writer.ClearRange(storage.MakeMVCCMetadataKey(start), storage.MakeMVCCMetadataKey(end)) 701 } 702 } else { 703 clearRangeFn = storage.ClearRangeWithHeuristic 704 } 705 706 for _, keyRange := range keyRanges { 707 if err := clearRangeFn(reader, writer, keyRange.Start.Key, keyRange.End.Key); err != nil { 708 return err 709 } 710 } 711 return nil 712 } 713 714 // applySnapshot updates the replica and its store based on the given snapshot 715 // and associated HardState. All snapshots must pass through Raft for 716 // correctness, i.e. the parameters to this method must be taken from a 717 // raft.Ready. Any replicas specified in subsumedRepls will be destroyed 718 // atomically with the application of the snapshot. 719 // 720 // If there is a placeholder associated with r, applySnapshot will remove that 721 // placeholder from the store if and only if it does not return an error. 722 // 723 // This method requires that r.raftMu is held, as well as the raftMus of any 724 // replicas in subsumedRepls. 
725 // 726 // TODO(benesch): the way this replica method reaches into its store to update 727 // replicasByKey is unfortunate, but the fix requires a substantial refactor to 728 // maintain the necessary synchronization. 729 func (r *Replica) applySnapshot( 730 ctx context.Context, 731 inSnap IncomingSnapshot, 732 snap raftpb.Snapshot, 733 hs raftpb.HardState, 734 subsumedRepls []*Replica, 735 ) (err error) { 736 s := *inSnap.State 737 if s.Desc.RangeID != r.RangeID { 738 log.Fatalf(ctx, "unexpected range ID %d", s.Desc.RangeID) 739 } 740 741 snapType := inSnap.snapType 742 defer func() { 743 if err == nil { 744 switch snapType { 745 case SnapshotRequest_RAFT: 746 r.store.metrics.RangeSnapshotsNormalApplied.Inc(1) 747 case SnapshotRequest_LEARNER: 748 r.store.metrics.RangeSnapshotsLearnerApplied.Inc(1) 749 } 750 } 751 }() 752 753 if raft.IsEmptySnap(snap) { 754 // Raft discarded the snapshot, indicating that our local state is 755 // already ahead of what the snapshot provides. But we count it for 756 // stats (see the defer above). 757 // 758 // Since we're not returning an error, we're responsible for removing any 759 // placeholder that might exist. 760 r.store.mu.Lock() 761 if r.store.removePlaceholderLocked(ctx, r.RangeID) { 762 atomic.AddInt32(&r.store.counts.filledPlaceholders, 1) 763 } 764 r.store.mu.Unlock() 765 return nil 766 } 767 if raft.IsEmptyHardState(hs) { 768 // Raft will never provide an empty HardState if it is providing a 769 // nonempty snapshot because we discard snapshots that do not increase 770 // the commit index. 771 log.Fatalf(ctx, "found empty HardState for non-empty Snapshot %+v", snap) 772 } 773 774 var stats struct { 775 // Time to process subsumed replicas. 776 subsumedReplicas time.Time 777 // Time to ingest SSTs. 
778 ingestion time.Time 779 } 780 log.Infof(ctx, "applying %s snapshot [id=%s index=%d]", 781 snapType, inSnap.SnapUUID.Short(), snap.Metadata.Index) 782 defer func(start time.Time) { 783 now := timeutil.Now() 784 totalLog := fmt.Sprintf( 785 "total=%0.0fms ", 786 now.Sub(start).Seconds()*1000, 787 ) 788 var subsumedReplicasLog string 789 if len(subsumedRepls) > 0 { 790 subsumedReplicasLog = fmt.Sprintf( 791 "subsumedReplicas=%d@%0.0fms ", 792 len(subsumedRepls), 793 stats.subsumedReplicas.Sub(start).Seconds()*1000, 794 ) 795 } 796 ingestionLog := fmt.Sprintf( 797 "ingestion=%d@%0.0fms ", 798 len(inSnap.SSTStorageScratch.SSTs()), 799 stats.ingestion.Sub(stats.subsumedReplicas).Seconds()*1000, 800 ) 801 log.Infof(ctx, "applied %s snapshot [%s%s%sid=%s index=%d]", 802 snapType, totalLog, subsumedReplicasLog, ingestionLog, 803 inSnap.SnapUUID.Short(), snap.Metadata.Index) 804 }(timeutil.Now()) 805 806 unreplicatedSSTFile := &storage.MemFile{} 807 unreplicatedSST := storage.MakeIngestionSSTWriter(unreplicatedSSTFile) 808 defer unreplicatedSST.Close() 809 810 // Clearing the unreplicated state. 811 unreplicatedPrefixKey := keys.MakeRangeIDUnreplicatedPrefix(r.RangeID) 812 unreplicatedStart := storage.MakeMVCCMetadataKey(unreplicatedPrefixKey) 813 unreplicatedEnd := storage.MakeMVCCMetadataKey(unreplicatedPrefixKey.PrefixEnd()) 814 if err = unreplicatedSST.ClearRange(unreplicatedStart, unreplicatedEnd); err != nil { 815 return errors.Wrapf(err, "error clearing range of unreplicated SST writer") 816 } 817 818 // Update HardState. 819 if err := r.raftMu.stateLoader.SetHardState(ctx, &unreplicatedSST, hs); err != nil { 820 return errors.Wrapf(err, "unable to write HardState to unreplicated SST writer") 821 } 822 823 // Update Raft entries. 
824 var lastTerm uint64 825 var raftLogSize int64 826 if len(inSnap.LogEntries) > 0 { 827 logEntries := make([]raftpb.Entry, len(inSnap.LogEntries)) 828 for i, bytes := range inSnap.LogEntries { 829 if err := protoutil.Unmarshal(bytes, &logEntries[i]); err != nil { 830 return err 831 } 832 } 833 var sideloadedEntriesSize int64 834 var err error 835 logEntries, sideloadedEntriesSize, err = r.maybeSideloadEntriesRaftMuLocked(ctx, logEntries) 836 if err != nil { 837 return err 838 } 839 raftLogSize += sideloadedEntriesSize 840 _, lastTerm, raftLogSize, err = r.append(ctx, &unreplicatedSST, 0, invalidLastTerm, raftLogSize, logEntries) 841 if err != nil { 842 return err 843 } 844 } else { 845 lastTerm = invalidLastTerm 846 } 847 r.store.raftEntryCache.Drop(r.RangeID) 848 849 // Update TruncatedState if it is unreplicated. 850 if inSnap.UsesUnreplicatedTruncatedState { 851 if err := r.raftMu.stateLoader.SetRaftTruncatedState( 852 ctx, &unreplicatedSST, s.TruncatedState, 853 ); err != nil { 854 return errors.Wrapf(err, "unable to write UnreplicatedTruncatedState to unreplicated SST writer") 855 } 856 } 857 858 if err := unreplicatedSST.Finish(); err != nil { 859 return err 860 } 861 if unreplicatedSST.DataSize > 0 { 862 // TODO(itsbilal): Write to SST directly in unreplicatedSST rather than 863 // buffering in a MemFile first. 
864 if err := inSnap.SSTStorageScratch.WriteSST(ctx, unreplicatedSSTFile.Data()); err != nil { 865 return err 866 } 867 } 868 869 if s.RaftAppliedIndex != snap.Metadata.Index { 870 log.Fatalf(ctx, "snapshot RaftAppliedIndex %d doesn't match its metadata index %d", 871 s.RaftAppliedIndex, snap.Metadata.Index) 872 } 873 874 if expLen := s.RaftAppliedIndex - s.TruncatedState.Index; expLen != uint64(len(inSnap.LogEntries)) { 875 entriesRange, err := extractRangeFromEntries(inSnap.LogEntries) 876 if err != nil { 877 return err 878 } 879 880 tag := fmt.Sprintf("r%d_%s", r.RangeID, inSnap.SnapUUID.String()) 881 dir, err := r.store.checkpoint(ctx, tag) 882 if err != nil { 883 log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err) 884 } else { 885 log.Warningf(ctx, "created checkpoint %s", dir) 886 } 887 888 log.Fatalf(ctx, "missing log entries in snapshot (%s): got %d entries, expected %d "+ 889 "(TruncatedState.Index=%d, HardState=%s, LogEntries=%s)", 890 inSnap.String(), len(inSnap.LogEntries), expLen, s.TruncatedState.Index, 891 hs.String(), entriesRange) 892 } 893 894 // If we're subsuming a replica below, we don't have its last NextReplicaID, 895 // nor can we obtain it. That's OK: we can just be conservative and use the 896 // maximum possible replica ID. preDestroyRaftMuLocked will write a replica 897 // tombstone using this maximum possible replica ID, which would normally be 898 // problematic, as it would prevent this store from ever having a new replica 899 // of the removed range. In this case, however, it's copacetic, as subsumed 900 // ranges _can't_ have new replicas. 901 if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSTStorageScratch, s.Desc, subsumedRepls, mergedTombstoneReplicaID); err != nil { 902 return err 903 } 904 stats.subsumedReplicas = timeutil.Now() 905 906 // Ingest all SSTs atomically. 
907 if fn := r.store.cfg.TestingKnobs.BeforeSnapshotSSTIngestion; fn != nil { 908 if err := fn(inSnap, snapType, inSnap.SSTStorageScratch.SSTs()); err != nil { 909 return err 910 } 911 } 912 if err := r.store.engine.IngestExternalFiles(ctx, inSnap.SSTStorageScratch.SSTs()); err != nil { 913 return errors.Wrapf(err, "while ingesting %s", inSnap.SSTStorageScratch.SSTs()) 914 } 915 stats.ingestion = timeutil.Now() 916 917 // The on-disk state is now committed, but the corresponding in-memory state 918 // has not yet been updated. Any errors past this point must therefore be 919 // treated as fatal. 920 921 if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, mergedTombstoneReplicaID); err != nil { 922 log.Fatalf(ctx, "failed to clear in-memory data of subsumed replicas while applying snapshot: %+v", err) 923 } 924 925 // Atomically swap the placeholder, if any, for the replica, and update the 926 // replica's descriptor. 927 r.store.mu.Lock() 928 if r.store.removePlaceholderLocked(ctx, r.RangeID) { 929 atomic.AddInt32(&r.store.counts.filledPlaceholders, 1) 930 } 931 r.setDescRaftMuLocked(ctx, s.Desc) 932 if err := r.store.maybeMarkReplicaInitializedLocked(ctx, r); err != nil { 933 log.Fatalf(ctx, "unable to mark replica initialized while applying snapshot: %+v", err) 934 } 935 r.store.mu.Unlock() 936 937 // Invoke the leasePostApply method to ensure we properly initialize the 938 // replica according to whether it holds the lease. We allow jumps in the 939 // lease sequence because there may be multiple lease changes accounted for 940 // in the snapshot. 941 r.leasePostApply(ctx, *s.Lease, true /* permitJump */) 942 943 // Inform the concurrency manager that this replica just applied a snapshot. 944 r.concMgr.OnReplicaSnapshotApplied() 945 946 r.mu.Lock() 947 // We set the persisted last index to the last applied index. 
This is 948 // not a correctness issue, but means that we may have just transferred 949 // some entries we're about to re-request from the leader and overwrite. 950 // However, raft.MultiNode currently expects this behavior, and the 951 // performance implications are not likely to be drastic. If our 952 // feelings about this ever change, we can add a LastIndex field to 953 // raftpb.SnapshotMetadata. 954 r.mu.lastIndex = s.RaftAppliedIndex 955 r.mu.lastTerm = lastTerm 956 r.mu.raftLogSize = raftLogSize 957 // Update the store stats for the data in the snapshot. 958 r.store.metrics.subtractMVCCStats(*r.mu.state.Stats) 959 r.store.metrics.addMVCCStats(*s.Stats) 960 // Update the rest of the Raft state. Changes to r.mu.state.Desc must be 961 // managed by r.setDescRaftMuLocked and changes to r.mu.state.Lease must be handled 962 // by r.leasePostApply, but we called those above, so now it's safe to 963 // wholesale replace r.mu.state. 964 r.mu.state = s 965 // Snapshots typically have fewer log entries than the leaseholder. The next 966 // time we hold the lease, recompute the log size before making decisions. 967 r.mu.raftLogSizeTrusted = false 968 r.assertStateLocked(ctx, r.store.Engine()) 969 r.mu.Unlock() 970 971 // The rangefeed processor is listening for the logical ops attached to 972 // each raft command. These will be lost during a snapshot, so disconnect 973 // the rangefeed, if one exists. 974 r.disconnectRangefeedWithReason( 975 roachpb.RangeFeedRetryError_REASON_RAFT_SNAPSHOT, 976 ) 977 978 // Update the replica's cached byte thresholds. This is a no-op if the system 979 // config is not available, in which case we rely on the next gossip update 980 // to perform the update. 
981 if err := r.updateRangeInfo(s.Desc); err != nil { 982 log.Fatalf(ctx, "unable to update range info while applying snapshot: %+v", err) 983 } 984 985 return nil 986 }

// clearSubsumedReplicaDiskData clears the on disk data of the subsumed
// replicas by creating SSTs with range deletion tombstones. We have to be
// careful here not to have overlapping ranges with the SSTs we have already
// created since that will throw an error while we are ingesting them. This
// method requires that each of the subsumed replicas raftMu is held.
//
// Every non-empty SST built here is handed to scratch via WriteSST. desc is
// the descriptor whose range-local and user key ranges are used as the
// baseline key span; subsumedNextReplicaID is forwarded unchanged to each
// subsumed replica's preDestroyRaftMuLocked call.
func (r *Replica) clearSubsumedReplicaDiskData(
	ctx context.Context,
	scratch *SSTSnapshotStorageScratch,
	desc *roachpb.RangeDescriptor,
	subsumedRepls []*Replica,
	subsumedNextReplicaID roachpb.ReplicaID,
) error {
	// getKeyRanges returns the range-local and user key ranges of a descriptor,
	// always in that order, so the two arrays built from it can be compared
	// index by index below.
	getKeyRanges := func(desc *roachpb.RangeDescriptor) [2]rditer.KeyRange {
		return [...]rditer.KeyRange{
			rditer.MakeRangeLocalKeyRange(desc),
			rditer.MakeUserKeyRange(desc),
		}
	}
	keyRanges := getKeyRanges(desc)
	// totalKeyRanges starts as a copy of keyRanges and is widened in the loop
	// below to also cover every subsumed replica.
	totalKeyRanges := append([]rditer.KeyRange(nil), keyRanges[:]...)
	for _, sr := range subsumedRepls {
		// We have to create an SST for the subsumed replica's range-id local keys.
		subsumedReplSSTFile := &storage.MemFile{}
		subsumedReplSST := storage.MakeIngestionSSTWriter(subsumedReplSSTFile)
		// The deferred Close runs when this function returns, not at the end of
		// the loop iteration.
		defer subsumedReplSST.Close()
		// NOTE: We set mustClearRange to true because we are setting
		// RangeTombstoneKey. Since Clears and Puts need to be done in increasing
		// order of keys, it is not safe to use ClearRangeIter.
		if err := sr.preDestroyRaftMuLocked(
			ctx,
			r.store.Engine(),
			&subsumedReplSST,
			subsumedNextReplicaID,
			true, /* clearRangeIDLocalOnly */
			true, /* mustClearRange */
		); err != nil {
			// NOTE(review): the writer is closed here and again by the deferred
			// call above; presumably Close is idempotent -- confirm.
			subsumedReplSST.Close()
			return err
		}
		if err := subsumedReplSST.Finish(); err != nil {
			return err
		}
		// Only non-empty SSTs are written to the scratch space.
		if subsumedReplSST.DataSize > 0 {
			// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than
			// buffering in a MemFile first.
			if err := scratch.WriteSST(ctx, subsumedReplSSTFile.Data()); err != nil {
				return err
			}
		}

		srKeyRanges := getKeyRanges(sr.Desc())
		// Compute the total key space covered by the current replica and all
		// subsumed replicas.
		for i := range srKeyRanges {
			if srKeyRanges[i].Start.Key.Compare(totalKeyRanges[i].Start.Key) < 0 {
				totalKeyRanges[i].Start = srKeyRanges[i].Start
			}
			if srKeyRanges[i].End.Key.Compare(totalKeyRanges[i].End.Key) > 0 {
				totalKeyRanges[i].End = srKeyRanges[i].End
			}
		}
	}

	// We might have to create SSTs for the range local keys and user keys
	// depending on if the subsumed replicas are not fully contained by the
	// replica in our snapshot. The following is an example of this case
	// happening.
	//
	//   a       b       c       d
	//   |---1---|-------2-------|  S1
	//   |---1-------------------|  S2
	//   |---1-----------|---3---|  S3
	//
	// Since the merge is the first operation to happen, a follower could be down
	// before it completes. It is reasonable for a snapshot for r1 from S3 to
	// subsume both r1 and r2 in S1.
	for i := range keyRanges {
		// Clear only the portion that extends past the end of the snapshot's
		// own key range: [keyRanges[i].End, totalKeyRanges[i].End).
		if totalKeyRanges[i].End.Key.Compare(keyRanges[i].End.Key) > 0 {
			subsumedReplSSTFile := &storage.MemFile{}
			subsumedReplSST := storage.MakeIngestionSSTWriter(subsumedReplSSTFile)
			// As above, this deferred Close runs at function return.
			defer subsumedReplSST.Close()
			if err := storage.ClearRangeWithHeuristic(
				r.store.Engine(),
				&subsumedReplSST,
				keyRanges[i].End.Key,
				totalKeyRanges[i].End.Key,
			); err != nil {
				subsumedReplSST.Close()
				return err
			}
			if err := subsumedReplSST.Finish(); err != nil {
				return err
			}
			if subsumedReplSST.DataSize > 0 {
				// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than
				// buffering in a MemFile first.
				if err := scratch.WriteSST(ctx, subsumedReplSSTFile.Data()); err != nil {
					return err
				}
			}
		}
		// The snapshot must never subsume a replica that extends the range of the
		// replica to the left. This is because splits and merges (the only
		// operations that change the key bounds) always leave the start key intact.
		// Extending to the left implies that either we merged "to the left" (we
		// don't), or that we're applying a snapshot for another range (we don't do
		// that either). Something is severely wrong for this to happen.
		if totalKeyRanges[i].Start.Key.Compare(keyRanges[i].Start.Key) < 0 {
			log.Fatalf(ctx, "subsuming replica to our left; key range: %v; total key range %v",
				keyRanges[i], totalKeyRanges[i])
		}
	}
	return nil
}

// clearSubsumedReplicaInMemoryData clears the in-memory data of the subsumed
// replicas. This method requires that each of the subsumed replicas raftMu is
// held.
//
// It returns the first error encountered; replicas later in subsumedRepls are
// not processed in that case.
func (r *Replica) clearSubsumedReplicaInMemoryData(
	ctx context.Context, subsumedRepls []*Replica, subsumedNextReplicaID roachpb.ReplicaID,
) error {
	for _, sr := range subsumedRepls {
		// We removed sr's data when we committed the batch. Finish subsumption by
		// updating the in-memory bookkeeping.
		if err := sr.postDestroyRaftMuLocked(ctx, sr.GetMVCCStats()); err != nil {
			return err
		}
		// We already hold sr's raftMu, so we must call
		// removeInitializedReplicaRaftMuLocked directly.
		// Note that it's safe to update the store's metadata for sr's removal
		// separately from updating the store's metadata for r's new descriptor
		// (i.e., under a different store.mu acquisition). Each store.mu
		// acquisition leaves the store in a consistent state, and access to the
		// replicas themselves is protected by their raftMus, which are held from
		// start to finish.
		if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, sr, subsumedNextReplicaID, RemoveOptions{
			DestroyData: false, // data is already destroyed
		}); err != nil {
			return err
		}
	}
	return nil
}

// extractRangeFromEntries returns a string representation of the index range
// of a marshaled list of raft log entries in the form of
// [first-index, last-index]. If the list is empty, "[n/a, n/a]" is returned
// instead.
1134 func extractRangeFromEntries(logEntries [][]byte) (string, error) { 1135 var firstIndex, lastIndex string 1136 if len(logEntries) == 0 { 1137 firstIndex = "n/a" 1138 lastIndex = "n/a" 1139 } else { 1140 firstAndLastLogEntries := make([]raftpb.Entry, 2) 1141 if err := protoutil.Unmarshal(logEntries[0], &firstAndLastLogEntries[0]); err != nil { 1142 return "", err 1143 } 1144 if err := protoutil.Unmarshal(logEntries[len(logEntries)-1], &firstAndLastLogEntries[1]); err != nil { 1145 return "", err 1146 } 1147 1148 firstIndex = string(firstAndLastLogEntries[0].Index) 1149 lastIndex = string(firstAndLastLogEntries[1].Index) 1150 } 1151 return fmt.Sprintf("[%s, %s]", firstIndex, lastIndex), nil 1152 } 1153 1154 type raftCommandEncodingVersion byte 1155 1156 // Raft commands are encoded with a 1-byte version (currently 0 or 1), an 8-byte 1157 // ID, followed by the payload. This inflexible encoding is used so we can 1158 // efficiently parse the command id while processing the logs. 1159 // 1160 // TODO(bdarnell): is this commandID still appropriate for our needs? 1161 const ( 1162 // The initial Raft command version, used for all regular Raft traffic. 1163 raftVersionStandard raftCommandEncodingVersion = 0 1164 // A proposal containing an SSTable which preferably should be sideloaded 1165 // (i.e. not stored in the Raft log wholesale). Can be treated as a regular 1166 // proposal when arriving on the wire, but when retrieved from the local 1167 // Raft log it necessary to inline the payload first as it has usually 1168 // been sideloaded. 1169 raftVersionSideloaded raftCommandEncodingVersion = 1 1170 // The prescribed length for each command ID. 1171 raftCommandIDLen = 8 1172 // The prescribed length of each encoded command's prefix. 1173 raftCommandPrefixLen = 1 + raftCommandIDLen 1174 // The no-split bit is now unused, but we still apply the mask to the first 1175 // byte of the command for backward compatibility. 
1176 // 1177 // TODO(tschottdorf): predates v1.0 by a significant margin. Remove. 1178 raftCommandNoSplitBit = 1 << 7 1179 raftCommandNoSplitMask = raftCommandNoSplitBit - 1 1180 ) 1181 1182 func encodeRaftCommand( 1183 version raftCommandEncodingVersion, commandID kvserverbase.CmdIDKey, command []byte, 1184 ) []byte { 1185 b := make([]byte, raftCommandPrefixLen+len(command)) 1186 encodeRaftCommandPrefix(b[:raftCommandPrefixLen], version, commandID) 1187 copy(b[raftCommandPrefixLen:], command) 1188 return b 1189 } 1190 1191 func encodeRaftCommandPrefix( 1192 b []byte, version raftCommandEncodingVersion, commandID kvserverbase.CmdIDKey, 1193 ) { 1194 if len(commandID) != raftCommandIDLen { 1195 panic(fmt.Sprintf("invalid command ID length; %d != %d", len(commandID), raftCommandIDLen)) 1196 } 1197 if len(b) != raftCommandPrefixLen { 1198 panic(fmt.Sprintf("invalid command prefix length; %d != %d", len(b), raftCommandPrefixLen)) 1199 } 1200 b[0] = byte(version) 1201 copy(b[1:], []byte(commandID)) 1202 } 1203 1204 // DecodeRaftCommand splits a raftpb.Entry.Data into its commandID and 1205 // command portions. The caller is responsible for checking that the data 1206 // is not empty (which indicates a dummy entry generated by raft rather 1207 // than a real command). Usage is mostly internal to the storage package 1208 // but is exported for use by debugging tools. 1209 func DecodeRaftCommand(data []byte) (kvserverbase.CmdIDKey, []byte) { 1210 v := raftCommandEncodingVersion(data[0] & raftCommandNoSplitMask) 1211 if v != raftVersionStandard && v != raftVersionSideloaded { 1212 panic(fmt.Sprintf("unknown command encoding version %v", data[0])) 1213 } 1214 return kvserverbase.CmdIDKey(data[1 : 1+raftCommandIDLen]), data[1+raftCommandIDLen:] 1215 }