github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_proposal.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "io/ioutil" 17 "os" 18 "path/filepath" 19 "strings" 20 "time" 21 "unsafe" 22 23 "github.com/cockroachdb/cockroach/pkg/base" 24 "github.com/cockroachdb/cockroach/pkg/clusterversion" 25 "github.com/cockroachdb/cockroach/pkg/keys" 26 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" 27 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result" 28 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 29 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 30 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" 31 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 32 "github.com/cockroachdb/cockroach/pkg/roachpb" 33 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 34 "github.com/cockroachdb/cockroach/pkg/storage" 35 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 36 "github.com/cockroachdb/cockroach/pkg/util" 37 "github.com/cockroachdb/cockroach/pkg/util/hlc" 38 "github.com/cockroachdb/cockroach/pkg/util/log" 39 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 40 "github.com/cockroachdb/cockroach/pkg/util/sysutil" 41 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 42 "github.com/cockroachdb/cockroach/pkg/util/tracing" 43 "github.com/cockroachdb/errors" 44 "github.com/kr/pretty" 45 opentracing "github.com/opentracing/opentracing-go" 46 "golang.org/x/time/rate" 47 ) 48 49 // ProposalData is data about a command which allows it to be 50 // evaluated, proposed to raft, and for the result of the command to 51 // be returned to the caller. 52 type ProposalData struct { 53 // The caller's context, used for logging proposals, reproposals, message 54 // sends, and command application. In order to enable safely tracing events 55 // beneath, modifying this ctx field in *ProposalData requires holding the 56 // raftMu. 57 ctx context.Context 58 59 // An optional tracing span bound to the proposal. Will be cleaned 60 // up when the proposal finishes. 61 sp opentracing.Span 62 63 // idKey uniquely identifies this proposal. 64 // TODO(andreimatei): idKey is legacy at this point: We could easily key 65 // commands by their MaxLeaseIndex, and doing so should be ok with a stop- 66 // the-world migration. However, various test facilities depend on the 67 // command ID for e.g. replay protection. 68 idKey kvserverbase.CmdIDKey 69 70 // proposedAtTicks is the (logical) time at which this command was 71 // last (re-)proposed. 72 proposedAtTicks int 73 74 // command is serialized and proposed to raft. In the event of 75 // reproposals its MaxLeaseIndex field is mutated. 76 command *kvserverpb.RaftCommand 77 78 // encodedCommand is the encoded Raft command, with an optional prefix 79 // containing the command ID. 80 encodedCommand []byte 81 82 // quotaAlloc is the allocation retrieved from the proposalQuota. Once a 83 // proposal has been passed to raft modifying this field requires holding the 84 // raftMu. 
Once the proposal comes out of Raft, ownership of this quota is 85 // passed to r.mu.quotaReleaseQueue. 86 quotaAlloc *quotapool.IntAlloc 87 88 // tmpFooter is used to avoid an allocation. 89 tmpFooter kvserverpb.RaftCommandFooter 90 91 // ec.done is called after command application to update the timestamp 92 // cache and optionally release latches and exit lock wait-queues. 93 ec endCmds 94 95 // applied is set when a command finishes application. It is used to 96 // avoid reproposing a failed proposal if an earlier version of the same 97 // proposal succeeded in applying. 98 applied bool 99 100 // doneCh is used to signal the waiting RPC handler (the contents of 101 // proposalResult come from LocalEvalResult). 102 // 103 // Attention: this channel is not to be signaled directly downstream of Raft. 104 // Always use ProposalData.finishApplication(). 105 doneCh chan proposalResult 106 107 // Local contains the results of evaluating the request, tying the upstream 108 // evaluation of the request to the downstream application of the command. 109 // Nil when the proposal came from another node (i.e. the evaluation wasn't 110 // done here). 111 Local *result.LocalResult 112 113 // Request is the client's original BatchRequest. 114 // TODO(tschottdorf): tests which use TestingCommandFilter use this. 115 // Decide how that will work in the future, presumably the 116 // CommandFilter would run at proposal time or we allow an opaque 117 // struct to be attached to a proposal which is then available as it 118 // applies. Other than tests, we only need a few bits of the request 119 // here; this could be replaced with isLease and isChangeReplicas 120 // booleans. 121 Request *roachpb.BatchRequest 122 } 123 124 // finishApplication is called when a command application has finished. The 125 // method will be called downstream of Raft if the command required consensus, 126 // but can be called upstream of Raft if the command did not and was never 127 // proposed. 128 // 129 // It first invokes the endCmds function and then sends the specified 130 // proposalResult on the proposal's done channel. endCmds is invoked here in 131 // order to allow the original client to be canceled. (When the original client 132 // is canceled, it won't be listening to this done channel, and so it can't be 133 // counted on to invoke endCmds itself.) 134 // 135 // The method is safe to call more than once, but only the first result will be 136 // returned to the client. 137 func (proposal *ProposalData) finishApplication(ctx context.Context, pr proposalResult) { 138 proposal.ec.done(ctx, proposal.Request, pr.Reply, pr.Err) 139 proposal.signalProposalResult(pr) 140 if proposal.sp != nil { 141 tracing.FinishSpan(proposal.sp) 142 proposal.sp = nil 143 } 144 } 145 146 // signalProposalResult signals proposal.doneCh with the proposal result if it 147 // has not already been signaled. The method can be called even before the 148 // proposal has finished replication and command application, and does not 149 // release the request's latches. 150 // 151 // The method is safe to call more than once, but only the first result will be 152 // returned to the client. 153 func (proposal *ProposalData) signalProposalResult(pr proposalResult) { 154 if proposal.doneCh != nil { 155 proposal.doneCh <- pr 156 proposal.doneCh = nil 157 } 158 } 159 160 // releaseQuota releases the proposal's quotaAlloc and sets it to nil. 161 // If the quotaAlloc is already nil, it is a no-op.
162 func (proposal *ProposalData) releaseQuota() { 163 if proposal.quotaAlloc != nil { 164 proposal.quotaAlloc.Release() 165 proposal.quotaAlloc = nil 166 } 167 } 168 169 // TODO(tschottdorf): we should find new homes for the checksum, lease 170 // code, and various others below to leave here only the core logic. 171 // Not moving anything right now to avoid awkward diffs. These should 172 // all be moved to replica_application_result.go. 173 174 func (r *Replica) gcOldChecksumEntriesLocked(now time.Time) { 175 for id, val := range r.mu.checksums { 176 // The timestamp is valid only if set. 177 if !val.gcTimestamp.IsZero() && now.After(val.gcTimestamp) { 178 delete(r.mu.checksums, id) 179 } 180 } 181 } 182 183 func (r *Replica) computeChecksumPostApply(ctx context.Context, cc kvserverpb.ComputeChecksum) { 184 stopper := r.store.Stopper() 185 now := timeutil.Now() 186 r.mu.Lock() 187 var notify chan struct{} 188 if c, ok := r.mu.checksums[cc.ChecksumID]; !ok { 189 // There is no record of this ID. Make a new notification. 190 notify = make(chan struct{}) 191 } else if !c.started { 192 // A CollectChecksumRequest is waiting on the existing notification. 193 notify = c.notify 194 } else { 195 log.Fatalf(ctx, "attempted to apply ComputeChecksum command with duplicated checksum ID %s", 196 cc.ChecksumID) 197 } 198 199 r.gcOldChecksumEntriesLocked(now) 200 201 // Create an entry with checksum == nil and gcTimestamp unset. 202 r.mu.checksums[cc.ChecksumID] = ReplicaChecksum{started: true, notify: notify} 203 desc := *r.mu.state.Desc 204 r.mu.Unlock() 205 206 if cc.Version != batcheval.ReplicaChecksumVersion { 207 r.computeChecksumDone(ctx, cc.ChecksumID, nil, nil) 208 log.Infof(ctx, "incompatible ComputeChecksum versions (requested: %d, have: %d)", 209 cc.Version, batcheval.ReplicaChecksumVersion) 210 return 211 } 212 213 // Caller is holding raftMu, so an engine snapshot is automatically 214 // Raft-consistent (i.e. not in the middle of an AddSSTable). 215 snap := r.store.engine.NewSnapshot() 216 if cc.Checkpoint { 217 sl := stateloader.Make(r.RangeID) 218 rai, _, err := sl.LoadAppliedIndex(ctx, snap) 219 if err != nil { 220 log.Warningf(ctx, "unable to load applied index, continuing anyway") 221 } 222 // NB: the names here will match on all nodes, which is nice for debugging. 223 tag := fmt.Sprintf("r%d_at_%d", r.RangeID, rai) 224 if dir, err := r.store.checkpoint(ctx, tag); err != nil { 225 log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err) 226 } else { 227 log.Warningf(ctx, "created checkpoint %s", dir) 228 } 229 } 230 231 // Compute SHA asynchronously and store it in a map by UUID. 232 if err := stopper.RunAsyncTask(ctx, "storage.Replica: computing checksum", func(ctx context.Context) { 233 func() { 234 defer snap.Close() 235 var snapshot *roachpb.RaftSnapshotData 236 if cc.SaveSnapshot { 237 snapshot = &roachpb.RaftSnapshotData{} 238 } 239 result, err := r.sha512(ctx, desc, snap, snapshot, cc.Mode) 240 if err != nil { 241 log.Errorf(ctx, "%v", err) 242 result = nil 243 } 244 r.computeChecksumDone(ctx, cc.ChecksumID, result, snapshot) 245 }() 246 247 var shouldFatal bool 248 for _, rDesc := range cc.Terminate { 249 if rDesc.StoreID == r.store.StoreID() && rDesc.ReplicaID == r.mu.replicaID { 250 shouldFatal = true 251 } 252 } 253 254 if shouldFatal { 255 // This node should fatal as a result of a previous consistency 256 // check (i.e. this round is carried out only to obtain a diff). 
257 // If we fatal too early, the diff won't make it back to the lease- 258 // holder and thus won't be printed to the logs. Since we're already 259 // in a goroutine that's about to end, simply sleep for a few seconds 260 // and then terminate. 261 auxDir := r.store.engine.GetAuxiliaryDir() 262 _ = os.MkdirAll(auxDir, 0755) 263 path := base.PreventedStartupFile(auxDir) 264 265 preventStartupMsg := fmt.Sprintf(`ATTENTION: 266 267 this node is terminating because a replica inconsistency was detected between %s 268 and its other replicas. Please check your cluster-wide log files for more 269 information and contact the CockroachDB support team. It is not necessarily safe 270 to replace this node; cluster data may still be at risk of corruption. 271 272 A checkpoints directory to aid (expert) debugging should be present in: 273 %s 274 275 A file preventing this node from restarting was placed at: 276 %s 277 `, r, auxDir, path) 278 279 if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil { 280 log.Warningf(ctx, "%v", err) 281 } 282 283 if p := r.store.cfg.TestingKnobs.ConsistencyTestingKnobs.OnBadChecksumFatal; p != nil { 284 p(*r.store.Ident) 285 } else { 286 time.Sleep(10 * time.Second) 287 log.Fatalf(r.AnnotateCtx(context.Background()), preventStartupMsg) 288 } 289 } 290 291 }); err != nil { 292 defer snap.Close() 293 log.Errorf(ctx, "could not run async checksum computation (ID = %s): %v", cc.ChecksumID, err) 294 // Set checksum to nil. 295 r.computeChecksumDone(ctx, cc.ChecksumID, nil, nil) 296 } 297 } 298 299 // leasePostApply updates the Replica's internal state to reflect the 300 // application of a new Range lease. The method is idempotent, so it can be 301 // called repeatedly for the same lease safely. However, the method will panic 302 // if passed a lease with a lower sequence number than the current lease. By 303 // default, the method will also panic if passed a lease that indicates a 304 // forward sequence number jump (i.e. a skipped lease). This behavior can 305 // be disabled by passing permitJump as true. 306 func (r *Replica) leasePostApply(ctx context.Context, newLease roachpb.Lease, permitJump bool) { 307 r.mu.Lock() 308 replicaID := r.mu.replicaID 309 // Pull out the last lease known to this Replica. It's possible that this is 310 // not actually the last lease in the Range's lease sequence because the 311 // Replica may have missed the application of a lease between prevLease and 312 // newLease. However, this should only be possible if a snapshot includes a 313 // lease update. All other forms of lease updates should be continuous 314 // without jumps (see permitJump). 315 prevLease := *r.mu.state.Lease 316 r.mu.Unlock() 317 318 iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID 319 // NB: in the case in which a node restarts, minLeaseProposedTS forces it to 320 // get a new lease and we make sure it gets a new sequence number, thus 321 // causing the right half of the disjunction to fire so that we update the 322 // timestamp cache. 323 leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID || prevLease.Sequence != newLease.Sequence 324 325 if iAmTheLeaseHolder { 326 // Log lease acquisition whenever an Epoch-based lease changes hands (or verbose 327 // logging is enabled). 
328 if newLease.Type() == roachpb.LeaseEpoch && leaseChangingHands || log.V(1) { 329 log.VEventf(ctx, 1, "new range lease %s following %s", newLease, prevLease) 330 } 331 } 332 333 if leaseChangingHands && iAmTheLeaseHolder { 334 // When taking over the lease, we need to check whether a merge is in 335 // progress, as only the old leaseholder would have been explicitly notified 336 // of the merge. If there is a merge in progress, maybeWatchForMerge will 337 // arrange to block all traffic to this replica unless the merge aborts. 338 if err := r.maybeWatchForMerge(ctx); err != nil { 339 // We were unable to determine whether a merge was in progress. We cannot 340 // safely proceed. 341 log.Fatalf(ctx, "failed checking for in-progress merge while installing new lease %s: %s", 342 newLease, err) 343 } 344 345 // If this replica is a new holder of the lease, update the low water 346 // mark of the timestamp cache. Note that clock offset scenarios are 347 // handled via a stasis period inherent in the lease which is documented 348 // in the Lease struct. 349 // 350 // The introduction of lease transfers implies that the previous lease 351 // may have been shortened and we are now applying a formally overlapping 352 // lease (since the old lease holder has promised not to serve any more 353 // requests, this is kosher). This means that we don't use the old 354 // lease's expiration but instead use the new lease's start to initialize 355 // the timestamp cache low water. 356 setTimestampCacheLowWaterMark(r.store.tsCache, r.Desc(), newLease.Start) 357 358 // Reset the request counts used to make lease placement decisions whenever 359 // starting a new lease. 360 if r.leaseholderStats != nil { 361 r.leaseholderStats.resetRequestCounts() 362 } 363 } 364 365 // Sanity check to make sure that the lease sequence is moving in the right 366 // direction. 367 if s1, s2 := prevLease.Sequence, newLease.Sequence; s1 != 0 { 368 // We're at a version that supports lease sequence numbers. 369 switch { 370 case s2 < s1: 371 log.Fatalf(ctx, "lease sequence inversion, prevLease=%s, newLease=%s", 372 log.Safe(prevLease), log.Safe(newLease)) 373 case s2 == s1: 374 // If the sequence numbers are the same, make sure they're actually 375 // the same lease. This can happen when callers are using 376 // leasePostApply for some of its side effects, like with 377 // splitPostApply. It can also happen during lease extensions. 378 if !prevLease.Equivalent(newLease) { 379 log.Fatalf(ctx, "sequence identical for different leases, prevLease=%s, newLease=%s", 380 log.Safe(prevLease), log.Safe(newLease)) 381 } 382 case s2 == s1+1: 383 // Lease sequence incremented by 1. Expected case. 384 case s2 > s1+1 && !permitJump: 385 log.Fatalf(ctx, "lease sequence jump, prevLease=%s, newLease=%s", 386 log.Safe(prevLease), log.Safe(newLease)) 387 } 388 } 389 390 // Ordering is critical here. We only install the new lease after we've 391 // checked for an in-progress merge and updated the timestamp cache. If the 392 // ordering were reversed, it would be possible for requests to see the new 393 // lease but not the updated merge or timestamp cache state, which can result 394 // in serializability violations. 395 r.mu.Lock() 396 r.mu.state.Lease = &newLease 397 expirationBasedLease := r.requiresExpiringLeaseRLocked() 398 r.mu.Unlock() 399 400 // Gossip the first range whenever its lease is acquired. 
We check to make 401 // sure the lease is active so that a trailing replica won't process an old 402 // lease request and attempt to gossip the first range. 403 if leaseChangingHands && iAmTheLeaseHolder && r.IsFirstRange() && r.IsLeaseValid(newLease, r.store.Clock().Now()) { 404 r.gossipFirstRange(ctx) 405 } 406 407 // Whenever we first acquire an expiration-based lease, notify the lease 408 // renewer worker that we want it to keep proactively renewing the lease 409 // before it expires. 410 if leaseChangingHands && iAmTheLeaseHolder && expirationBasedLease && r.IsLeaseValid(newLease, r.store.Clock().Now()) { 411 r.store.renewableLeases.Store(int64(r.RangeID), unsafe.Pointer(r)) 412 select { 413 case r.store.renewableLeasesSignal <- struct{}{}: 414 default: 415 } 416 } 417 418 // If we're the current raft leader, we may want to transfer the leadership to 419 // the new leaseholder. Note that this condition is also checked periodically 420 // when ticking the replica. 421 r.maybeTransferRaftLeadership(ctx) 422 423 // Notify the store that a lease change occurred and it may need to 424 // gossip the updated store descriptor (with updated capacity). 425 prevOwner := prevLease.OwnedBy(r.store.StoreID()) 426 currentOwner := newLease.OwnedBy(r.store.StoreID()) 427 if leaseChangingHands && (prevOwner || currentOwner) { 428 if currentOwner { 429 r.store.maybeGossipOnCapacityChange(ctx, leaseAddEvent) 430 } else if prevOwner { 431 r.store.maybeGossipOnCapacityChange(ctx, leaseRemoveEvent) 432 } 433 if r.leaseholderStats != nil { 434 r.leaseholderStats.resetRequestCounts() 435 } 436 } 437 438 // Inform the concurrency manager that the lease holder has been updated. 439 r.concMgr.OnRangeLeaseUpdated(iAmTheLeaseHolder) 440 441 // Potentially re-gossip if the range contains system data (e.g. system 442 // config or node liveness). We need to perform this gossip at startup as 443 // soon as possible. Trying to minimize how often we gossip is a fool's 444 // errand. The node liveness info will be gossiped frequently (every few 445 // seconds) in any case due to the liveness heartbeats. And the system config 446 // will be gossiped rarely because it falls on a range with an epoch-based 447 // range lease that is only reacquired extremely infrequently. 448 if iAmTheLeaseHolder { 449 if err := r.MaybeGossipSystemConfig(ctx); err != nil { 450 log.Errorf(ctx, "%v", err) 451 } 452 if err := r.MaybeGossipNodeLiveness(ctx, keys.NodeLivenessSpan); err != nil { 453 log.Errorf(ctx, "%v", err) 454 } 455 456 // Emit an MLAI on the leaseholder replica, as followers will be looking 457 // for one, and if we went on to quiesce, they wouldn't necessarily get 458 // one otherwise (unless they ask for it, which adds latency). 459 r.EmitMLAI() 460 461 if leaseChangingHands && log.V(1) { 462 // This logging is useful to troubleshoot incomplete drains. 463 log.Info(ctx, "is now leaseholder") 464 } 465 } 466 467 // Mark the new lease in the replica's lease history.
468 if r.leaseHistory != nil { 469 r.leaseHistory.add(newLease) 470 } 471 } 472 473 func addSSTablePreApply( 474 ctx context.Context, 475 st *cluster.Settings, 476 eng storage.Engine, 477 sideloaded SideloadStorage, 478 term, index uint64, 479 sst kvserverpb.ReplicatedEvalResult_AddSSTable, 480 limiter *rate.Limiter, 481 ) bool { 482 checksum := util.CRC32(sst.Data) 483 484 if checksum != sst.CRC32 { 485 log.Fatalf( 486 ctx, 487 "checksum for AddSSTable at term %d, index %d does not match; at proposal time %x (%d), now %x (%d)", 488 term, index, sst.CRC32, sst.CRC32, checksum, checksum, 489 ) 490 } 491 492 path, err := sideloaded.Filename(ctx, index, term) 493 if err != nil { 494 log.Fatalf(ctx, "sideloaded SSTable at term %d, index %d is missing", term, index) 495 } 496 497 eng.PreIngestDelay(ctx) 498 499 copied := false 500 if eng.InMem() { 501 path = fmt.Sprintf("%x", checksum) 502 if err := eng.WriteFile(path, sst.Data); err != nil { 503 panic(err) 504 } 505 } else { 506 ingestPath := path + ".ingested" 507 508 canLinkToRaftFile := false 509 // The SST may already be on disk, thanks to the sideloading mechanism. If 510 // so we can try to add that file directly, via a new hardlink if the file- 511 // system supports it, rather than writing a new copy of it. However, this is 512 // only safe if we can do so without modifying the file since it is still 513 // part of an immutable raft log message, but in some cases, described in 514 // DBIngestExternalFile, RocksDB would modify the file. Fortunately we can 515 // tell Rocks that it is not allowed to modify the file, in which case it 516 // will return an error if it would have tried to do so, at which point we 517 // can fall back to writing a new copy for Rocks to ingest. 518 if _, links, err := sysutil.StatAndLinkCount(path); err == nil { 519 // HACK: RocksDB does not like ingesting the same file (by inode) twice. 520 // See facebook/rocksdb#5133. We can tell that we have tried to ingest 521 // this file already if it has more than one link – one from the file raft 522 // wrote and one from rocks. In that case, we should not try to give 523 // rocks a link to the same file again. 524 if links == 1 { 525 canLinkToRaftFile = true 526 } else { 527 log.Warningf(ctx, "SSTable at index %d term %d may have already been ingested (link count %d) -- falling back to ingesting a copy", 528 index, term, links) 529 } 530 } 531 532 if canLinkToRaftFile { 533 // If the fs supports it, make a hard-link for rocks to ingest. We cannot 534 // pass it the path in the sideload store as it deletes the passed path on 535 // success. 536 if linkErr := eng.Link(path, ingestPath); linkErr == nil { 537 ingestErr := eng.IngestExternalFiles(ctx, []string{ingestPath}) 538 if ingestErr == nil { 539 // Adding without modification succeeded, no copy necessary. 540 log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, ingestPath) 541 return false 542 } 543 if rmErr := eng.Remove(ingestPath); rmErr != nil { 544 log.Fatalf(ctx, "failed to remove ingest sst: %v", rmErr) 545 } 546 const seqNoMsg = "Global seqno is required, but disabled" 547 const seqNoOnReIngest = "external file have non zero sequence number" 548 // Repeated ingestion is still possible even with the link count checked 549 // above, since rocks might have already compacted away the file. 550 // However, it does not flush compacted files from its cache, so it can 551 // still react poorly to attempting to ingest again.
If we get an error 552 // that indicates we can't ingest, we'll make a copy and try again. That 553 // attempt must succeed or we'll fatal, so any persistent error is still 554 // going to be surfaced. 555 ingestErrMsg := ingestErr.Error() 556 isSeqNoErr := strings.Contains(ingestErrMsg, seqNoMsg) || strings.Contains(ingestErrMsg, seqNoOnReIngest) 557 if sstErr := (*storage.Error)(nil); !errors.As(ingestErr, &sstErr) || !isSeqNoErr { 558 log.Fatalf(ctx, "while ingesting %s: %v", ingestPath, ingestErr) 559 } 560 } 561 } 562 563 path = ingestPath 564 565 log.Eventf(ctx, "copying SSTable for ingestion at index %d, term %d: %s", index, term, path) 566 567 // TODO(tschottdorf): remove this once sideloaded storage guarantees its 568 // existence. 569 if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil { 570 panic(err) 571 } 572 if _, err := os.Stat(path); err == nil { 573 // The file we want to ingest exists. This can happen since the 574 // ingestion may apply twice (we ingest before we mark the Raft 575 // command as committed). Just unlink the file (RocksDB created a 576 // hard link); after that we're free to write it again. 577 if err := os.Remove(path); err != nil { 578 log.Fatalf(ctx, "while removing existing file during ingestion of %s: %+v", path, err) 579 } 580 } 581 582 if err := writeFileSyncing(ctx, path, sst.Data, eng, 0600, st, limiter); err != nil { 583 log.Fatalf(ctx, "while ingesting %s: %+v", path, err) 584 } 585 copied = true 586 } 587 588 if err := eng.IngestExternalFiles(ctx, []string{path}); err != nil { 589 log.Fatalf(ctx, "while ingesting %s: %+v", path, err) 590 } 591 log.Eventf(ctx, "ingested SSTable at index %d, term %d: %s", index, term, path) 592 return copied 593 } 594 595 func (r *Replica) handleReadWriteLocalEvalResult(ctx context.Context, lResult result.LocalResult) { 596 // Fields for which no action is taken in this method are zeroed so that 597 // they don't trigger an assertion at the end of the method (which checks 598 // that all fields were handled). 599 { 600 lResult.Reply = nil 601 } 602 603 // The caller is required to detach and handle the following three fields. 604 if lResult.EncounteredIntents != nil { 605 log.Fatalf(ctx, "LocalEvalResult.EncounteredIntents should be nil: %+v", lResult.EncounteredIntents) 606 } 607 if lResult.EndTxns != nil { 608 log.Fatalf(ctx, "LocalEvalResult.EndTxns should be nil: %+v", lResult.EndTxns) 609 } 610 if lResult.MaybeWatchForMerge { 611 log.Fatalf(ctx, "LocalEvalResult.MaybeWatchForMerge should be false") 612 } 613 614 if lResult.AcquiredLocks != nil { 615 for i := range lResult.AcquiredLocks { 616 r.concMgr.OnLockAcquired(ctx, &lResult.AcquiredLocks[i]) 617 } 618 lResult.AcquiredLocks = nil 619 } 620 621 if lResult.ResolvedLocks != nil { 622 for i := range lResult.ResolvedLocks { 623 r.concMgr.OnLockUpdated(ctx, &lResult.ResolvedLocks[i]) 624 } 625 lResult.ResolvedLocks = nil 626 } 627 628 if lResult.UpdatedTxns != nil { 629 for _, txn := range lResult.UpdatedTxns { 630 r.concMgr.OnTransactionUpdated(ctx, txn) 631 } 632 lResult.UpdatedTxns = nil 633 } 634 635 if lResult.GossipFirstRange { 636 // We need to run the gossip in an async task because gossiping requires 637 // the range lease and we'll deadlock if we try to acquire it while 638 // holding processRaftMu.
Specifically, Replica.redirectOnOrAcquireLease 639 // blocks waiting for the lease acquisition to finish but it can't finish 640 // because we're not processing raft messages due to holding 641 // processRaftMu (and running on the processRaft goroutine). 642 if err := r.store.Stopper().RunAsyncTask( 643 ctx, "storage.Replica: gossipping first range", 644 func(ctx context.Context) { 645 hasLease, pErr := r.getLeaseForGossip(ctx) 646 647 if pErr != nil { 648 log.Infof(ctx, "unable to gossip first range; hasLease=%t, err=%s", hasLease, pErr) 649 } else if !hasLease { 650 return 651 } 652 r.gossipFirstRange(ctx) 653 }); err != nil { 654 log.Infof(ctx, "unable to gossip first range: %s", err) 655 } 656 lResult.GossipFirstRange = false 657 } 658 659 if lResult.MaybeAddToSplitQueue { 660 r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now()) 661 lResult.MaybeAddToSplitQueue = false 662 } 663 664 if lResult.MaybeGossipSystemConfig { 665 if err := r.MaybeGossipSystemConfig(ctx); err != nil { 666 log.Errorf(ctx, "%v", err) 667 } 668 lResult.MaybeGossipSystemConfig = false 669 } 670 671 if lResult.MaybeGossipSystemConfigIfHaveFailure { 672 if err := r.MaybeGossipSystemConfigIfHaveFailure(ctx); err != nil { 673 log.Errorf(ctx, "%v", err) 674 } 675 lResult.MaybeGossipSystemConfigIfHaveFailure = false 676 } 677 678 if lResult.MaybeGossipNodeLiveness != nil { 679 if err := r.MaybeGossipNodeLiveness(ctx, *lResult.MaybeGossipNodeLiveness); err != nil { 680 log.Errorf(ctx, "%v", err) 681 } 682 lResult.MaybeGossipNodeLiveness = nil 683 } 684 685 if lResult.Metrics != nil { 686 r.store.metrics.handleMetricsResult(ctx, *lResult.Metrics) 687 lResult.Metrics = nil 688 } 689 690 if !lResult.IsZero() { 691 log.Fatalf(ctx, "unhandled field in LocalEvalResult: %s", pretty.Diff(lResult, result.LocalResult{})) 692 } 693 } 694 695 // proposalResult indicates the result of a proposal. Exactly one of 696 // Reply and Err is set, and it represents the result of the proposal. 697 type proposalResult struct { 698 Reply *roachpb.BatchResponse 699 Err *roachpb.Error 700 EncounteredIntents []roachpb.Intent 701 EndTxns []result.EndTxnIntents 702 } 703 704 // evaluateProposal generates a Result from the given request by 705 // evaluating it, returning both state which is held only on the 706 // proposer and that which is to be replicated through Raft. The 707 // return value is ready to be inserted into Replica's proposal map 708 // and subsequently passed to submitProposalLocked. 709 // 710 // The method also returns a flag indicating if the request needs to 711 // be proposed through Raft and replicated. This flag will be false 712 // either if the request was a no-op or if it hit an error. In this 713 // case, the result can be sent directly back to the client without 714 // going through Raft, but while still handling LocalEvalResult. 715 // 716 // Replica.mu must not be held. 717 func (r *Replica) evaluateProposal( 718 ctx context.Context, 719 idKey kvserverbase.CmdIDKey, 720 ba *roachpb.BatchRequest, 721 latchSpans *spanset.SpanSet, 722 ) (*result.Result, bool, *roachpb.Error) { 723 if ba.Timestamp == (hlc.Timestamp{}) { 724 return nil, false, roachpb.NewErrorf("can't propose Raft command with zero timestamp") 725 } 726 727 // Evaluate the commands. If this returns without an error, the batch should 728 // be committed. Note that we don't hold any locks at this point. This is 729 // important since evaluating a proposal is expensive. 
730 // TODO(tschottdorf): absorb all returned values in `res` below this point 731 // in the call stack as well. 732 batch, ms, br, res, pErr := r.evaluateWriteBatch(ctx, idKey, ba, latchSpans) 733 734 // Note: reusing the proposer's batch when applying the command on the 735 // proposer was explored as an optimization but resulted in no performance 736 // benefit. 737 if batch != nil { 738 defer batch.Close() 739 } 740 741 if pErr != nil { 742 pErr = r.maybeSetCorrupt(ctx, pErr) 743 744 txn := pErr.GetTxn() 745 if txn != nil && ba.Txn == nil { 746 log.Fatalf(ctx, "error had a txn but batch is non-transactional. Err txn: %s", txn) 747 } 748 749 // Failed proposals can't have any Result except for what's 750 // whitelisted here. 751 res.Local = result.LocalResult{ 752 EncounteredIntents: res.Local.DetachEncounteredIntents(), 753 EndTxns: res.Local.DetachEndTxns(true /* alwaysOnly */), 754 Metrics: res.Local.Metrics, 755 } 756 res.Replicated.Reset() 757 return &res, false /* needConsensus */, pErr 758 } 759 760 // Set the local reply, which is held only on the proposing replica and is 761 // returned to the client after the proposal completes, or immediately if 762 // replication is not necessary. 763 res.Local.Reply = br 764 765 // needConsensus determines if the result needs to be replicated and 766 // proposed through Raft. This is necessary if at least one of the 767 // following conditions is true: 768 // 1. the request created a non-empty write batch. 769 // 2. the request had an impact on the MVCCStats. NB: this is possible 770 // even with an empty write batch when stats are recomputed. 771 // 3. the request has replicated side-effects. 772 needConsensus := !batch.Empty() || 773 ms != (enginepb.MVCCStats{}) || 774 !res.Replicated.Equal(kvserverpb.ReplicatedEvalResult{}) 775 776 if needConsensus { 777 // Set the proposal's WriteBatch, which is the serialized representation of 778 // the proposal's effect on RocksDB. 779 res.WriteBatch = &kvserverpb.WriteBatch{ 780 Data: batch.Repr(), 781 } 782 783 // Set the proposal's replicated result, which contains metadata and 784 // side-effects that are to be replicated to all replicas. 785 res.Replicated.IsLeaseRequest = ba.IsLeaseRequest() 786 res.Replicated.Timestamp = ba.Timestamp 787 res.Replicated.Delta = ms.ToStatsDelta() 788 789 _ = clusterversion.VersionContainsEstimatesCounter // see that constant for info on the ContainsEstimates migration 790 if r.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionContainsEstimatesCounter) { 791 // Encode that this command (and any that follow) uses regular arithmetic for ContainsEstimates 792 // by making sure ContainsEstimates is > 1. 793 // This will be interpreted during command application. 794 if res.Replicated.Delta.ContainsEstimates > 0 { 795 res.Replicated.Delta.ContainsEstimates *= 2 796 } 797 } else { 798 // This range may still need to have its commands processed by nodes which treat ContainsEstimates 799 // as a bool, so clamp it to {0,1}. This enables use of bool semantics in command application. 800 if res.Replicated.Delta.ContainsEstimates > 1 { 801 res.Replicated.Delta.ContainsEstimates = 1 802 } else if res.Replicated.Delta.ContainsEstimates < 0 { 803 // The caller should have checked the cluster version. At the 804 // time of writing, this is only RecomputeStats and the split 805 // trigger, which both have the check, but better safe than sorry.
806 log.Fatalf(ctx, "cannot propose negative ContainsEstimates "+ 807 "without VersionContainsEstimatesCounter in %s", ba.Summary()) 808 } 809 } 810 811 // If the RangeAppliedState key is not being used and the cluster version is 812 // high enough to guarantee that all current and future binaries will 813 // understand the key, we send the migration flag through Raft. Because 814 // there is a delay between command proposal and application, we may end up 815 // setting this migration flag multiple times. This is ok, because the 816 // migration is idempotent. 817 // TODO(nvanbenschoten): This will be baked in to 2.1, so it can be removed 818 // in the 2.2 release. 819 r.mu.RLock() 820 usingAppliedStateKey := r.mu.state.UsingAppliedStateKey 821 r.mu.RUnlock() 822 if !usingAppliedStateKey { 823 // The range applied state was introduced in v2.1. It's possible to 824 // still find ranges that haven't activated it. If so, activate it. 825 // We can remove this code if we introduce a boot-time check that 826 // fails the startup process when any legacy replicas are found. The 827 // operator can then run the old binary for a while to upgrade the 828 // stragglers. 829 if res.Replicated.State == nil { 830 res.Replicated.State = &kvserverpb.ReplicaState{} 831 } 832 res.Replicated.State.UsingAppliedStateKey = true 833 } 834 } 835 836 return &res, needConsensus, nil 837 } 838 839 // requestToProposal converts a BatchRequest into a ProposalData, by 840 // evaluating it. The returned ProposalData is partially valid even 841 // on a non-nil *roachpb.Error and should be proposed through Raft 842 // if ProposalData.command is non-nil. 843 // 844 // TODO(nvanbenschoten): combine idKey, ba, and latchSpans into a 845 // `serializedRequest` struct. 846 func (r *Replica) requestToProposal( 847 ctx context.Context, 848 idKey kvserverbase.CmdIDKey, 849 ba *roachpb.BatchRequest, 850 latchSpans *spanset.SpanSet, 851 ) (*ProposalData, *roachpb.Error) { 852 res, needConsensus, pErr := r.evaluateProposal(ctx, idKey, ba, latchSpans) 853 854 // Fill out the results even if pErr != nil; we'll return the error below. 855 proposal := &ProposalData{ 856 ctx: ctx, 857 idKey: idKey, 858 doneCh: make(chan proposalResult, 1), 859 Local: &res.Local, 860 Request: ba, 861 } 862 863 if needConsensus { 864 proposal.command = &kvserverpb.RaftCommand{ 865 ReplicatedEvalResult: res.Replicated, 866 WriteBatch: res.WriteBatch, 867 LogicalOpLog: res.LogicalOpLog, 868 TraceData: r.getTraceData(ctx), 869 } 870 } 871 872 return proposal, pErr 873 } 874 875 // getTraceData extracts the SpanContext of the current span. 876 func (r *Replica) getTraceData(ctx context.Context) opentracing.TextMapCarrier { 877 sp := opentracing.SpanFromContext(ctx) 878 if sp == nil { 879 return nil 880 } 881 if tracing.IsBlackHoleSpan(sp) { 882 return nil 883 } 884 traceData := opentracing.TextMapCarrier{} 885 if err := r.AmbientContext.Tracer.Inject( 886 sp.Context(), opentracing.TextMap, traceData, 887 ); err != nil { 888 log.Errorf(ctx, "failed to inject sp context (%+v) as trace data: %s", sp.Context(), err) 889 return nil 890 } 891 return traceData 892 }
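The signal-once contract around ProposalData.doneCh and signalProposalResult in the listing above can be illustrated with a small standalone sketch. The program below is not part of this package; the proposal and result types are simplified stand-ins, and only the pattern itself (a channel buffered with capacity 1 that is nilled out after the first send) mirrors the code above.

package main

import "fmt"

// proposalResult is a simplified stand-in for the package's proposalResult.
type proposalResult struct{ msg string }

// proposal is a simplified stand-in for ProposalData: doneCh is buffered with
// capacity 1, like the channel created in requestToProposal.
type proposal struct {
	doneCh chan proposalResult
}

// signalProposalResult mirrors the method above: the send never blocks because
// the channel has capacity 1, and nilling out doneCh makes later calls no-ops,
// so only the first result ever reaches the waiting handler.
func (p *proposal) signalProposalResult(pr proposalResult) {
	if p.doneCh != nil {
		p.doneCh <- pr
		p.doneCh = nil
	}
}

func main() {
	p := &proposal{doneCh: make(chan proposalResult, 1)}
	waiter := p.doneCh // the RPC handler keeps its own reference to the channel

	p.signalProposalResult(proposalResult{msg: "applied"})
	p.signalProposalResult(proposalResult{msg: "dropped"}) // no-op: already signaled

	fmt.Println((<-waiter).msg) // prints "applied"
}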
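The ContainsEstimates handling in evaluateProposal encodes a migration: under the newer cluster version a positive delta is doubled so that command application can recognize counter semantics (any value greater than 1), while under the older version the delta is clamped to {0, 1} to preserve boolean semantics. Below is a minimal standalone sketch of that arithmetic; the helper name is illustrative and not taken from the package.

package main

import "fmt"

// encodeContainsEstimates is an illustrative helper (not a CockroachDB API)
// mirroring the version gate in evaluateProposal.
func encodeContainsEstimates(delta int64, counterVersionActive bool) int64 {
	if counterVersionActive {
		// Counter semantics: double a positive value so the result is > 1,
		// signaling regular arithmetic during command application.
		if delta > 0 {
			delta *= 2
		}
		return delta
	}
	// Legacy path: clamp to {0, 1} so old nodes can keep treating the field
	// as a bool. Negative values require the counter version; the real code
	// fatals in that case.
	if delta > 1 {
		return 1
	}
	return delta
}

func main() {
	fmt.Println(encodeContainsEstimates(3, true))  // 6: counter semantics
	fmt.Println(encodeContainsEstimates(3, false)) // 1: clamped for bool semantics
	fmt.Println(encodeContainsEstimates(0, true))  // 0: unchanged either way
}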
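The sanity check on lease sequence numbers in leasePostApply distinguishes four cases: a regression (always fatal), an unchanged sequence (which must belong to an equivalent lease), an increment of exactly one (the expected case), and a larger jump (fatal unless permitJump is set). The standalone sketch below reproduces the same case analysis, returning errors instead of fatally logging; the function name is illustrative.

package main

import "fmt"

// checkLeaseSequence is an illustrative helper (not a CockroachDB API) that
// follows the switch in leasePostApply. prev == 0 means sequence numbers are
// not in use, so there is nothing to check.
func checkLeaseSequence(prev, next uint64, equivalent, permitJump bool) error {
	if prev == 0 {
		return nil
	}
	switch {
	case next < prev:
		return fmt.Errorf("lease sequence inversion: %d -> %d", prev, next)
	case next == prev:
		// Same sequence number: it must be the same (equivalent) lease, e.g.
		// an extension or a call made only for its side effects.
		if !equivalent {
			return fmt.Errorf("sequence %d identical for different leases", next)
		}
	case next == prev+1:
		// Expected case: the sequence advanced by exactly one.
	case next > prev+1 && !permitJump:
		return fmt.Errorf("lease sequence jump: %d -> %d", prev, next)
	}
	return nil
}

func main() {
	fmt.Println(checkLeaseSequence(5, 6, false, false)) // <nil>: normal increment
	fmt.Println(checkLeaseSequence(5, 8, false, false)) // error: unexpected jump
	fmt.Println(checkLeaseSequence(5, 8, false, true))  // <nil>: jump permitted
}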