// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Origin: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/kvserverpb/proposer_kv.proto

syntax = "proto3";
package cockroach.kv.kvserver.storagepb;
option go_package = "kvserverpb";

import "roachpb/api.proto";
import "roachpb/data.proto";
import "roachpb/metadata.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "kv/kvserver/kvserverpb/state.proto";
import "util/hlc/timestamp.proto";

import "gogoproto/gogo.proto";

// Split is emitted when a Replica commits a split trigger. It signals that the
// Replica has prepared the on-disk state for both the left and right hand
// sides of the split, and that the left hand side Replica should be updated as
// well as the right hand side created.
message Split {
  option (gogoproto.equal) = true;

  // The split trigger that was committed. It is non-nullable and embedded in
  // the generated Go struct.
  roachpb.SplitTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
  // RHSDelta holds the statistics for what was written to what is now the
  // right-hand side of the split during the batch which executed it.
  // The on-disk state of the right-hand side is already correct, but the
  // Store must learn about this delta to update its counters appropriately.
  storage.enginepb.MVCCStats rhs_delta = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "RHSDelta"
  ];
}

// Merge is emitted by a Replica which commits a transaction with
// a MergeTrigger (i.e. absorbs its right neighbor).
message Merge {
  option (gogoproto.equal) = true;

  // The merge trigger that was committed. It is non-nullable and embedded in
  // the generated Go struct.
  roachpb.MergeTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// ChangeReplicas is emitted by a Replica which commits a transaction with
// a ChangeReplicasTrigger.
message ChangeReplicas {
  option (gogoproto.equal) = true;

  // The generated String() method is disabled for this message.
  option (gogoproto.goproto_stringer) = false;

  // The replica change trigger that was committed. It is non-nullable and
  // embedded in the generated Go struct.
  roachpb.ChangeReplicasTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// ComputeChecksum is emitted when a ComputeChecksum request is evaluated. It
// instructs the replica to compute a checksum at the time the command is
// applied.
message ComputeChecksum {
  option (gogoproto.equal) = true;

  // ChecksumID is a handle by which the checksum can be retrieved in a later
  // CollectChecksum request.
  bytes checksum_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "ChecksumID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
  ];

  // The version used to pick the checksum method. Only when the version matches
  // that hardcoded in the binary will a computation be carried out.
  uint32 version = 5;

  // SaveSnapshot indicates that the snapshot used to compute the checksum
  // should be saved so that a diff of divergent replicas can later be computed.
  bool save_snapshot = 2;
  // The checksum mode; see roachpb.ChecksumMode for the available values.
  roachpb.ChecksumMode mode = 3;
  // If set, a checkpoint (i.e. cheap backup) of the engine will be taken. This
  // is expected to be set only if we already know that there is an
  // inconsistency and we want to preserve as much state as possible.
  bool checkpoint = 4;
  // Replicas processing this command which find themselves in this slice will
  // terminate. See `CheckConsistencyRequest.Terminate`.
  repeated roachpb.ReplicaDescriptor terminate = 6 [
    (gogoproto.nullable) = false
  ];
}

// Compaction holds core details about a suggested compaction.
message Compaction {
  option (gogoproto.equal) = true;

  // bytes indicates the expected space reclamation from compaction.
  int64 bytes = 1;
  // suggested_at_nanos is nanoseconds since the epoch.
  int64 suggested_at_nanos = 2;
}

// SuggestedCompaction holds start and end keys in conjunction with
// the compaction details.
message SuggestedCompaction {
  option (gogoproto.equal) = true;

  bytes start_key = 1 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"
  ];
  bytes end_key = 2 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"
  ];

  Compaction compaction = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}

// ReplicatedEvalResult is the structured information which together with
// a RocksDB WriteBatch constitutes the proposal payload in proposer-evaluated
// KV. For the majority of proposals, we expect ReplicatedEvalResult to be
// trivial; only changes to the metadata state (splits, merges, rebalances,
// leases, log truncation, ...) of the Replica or certain special commands must
// sideline information here based on which all Replicas must take action.
message ReplicatedEvalResult {
  option (gogoproto.equal) = true;

  // Updates to the Replica's ReplicaState. By convention and as outlined on
  // the comment on the ReplicaState message, this field is sparsely populated
  // and any field set overwrites the corresponding field in the state, perhaps
  // with additional side effects (for instance on a descriptor update).
  kv.kvserver.storagepb.ReplicaState state = 2;
  // See the comment on the Split message.
  Split split = 3;
  // See the comment on the Merge message.
  Merge merge = 4;
  // See the comment on the ComputeChecksum message.
  ComputeChecksum compute_checksum = 21;
  // NOTE(review): presumably set when the proposal is a lease request;
  // confirm against the code that populates it.
  bool is_lease_request = 6;
  // Duplicates BatchRequest.Timestamp for proposer-evaluated KV. Used
  // to verify the validity of the command (for lease coverage and GC
  // threshold).
  util.hlc.Timestamp timestamp = 8 [(gogoproto.nullable) = false];
  // The stats delta corresponding to the data in this WriteBatch. On
  // a split, contains only the contributions to the left-hand side.
  storage.enginepb.MVCCStats deprecated_delta = 10; // See #18828
  storage.enginepb.MVCCStatsDelta delta = 18 [(gogoproto.nullable) = false];
  // See the comment on the ChangeReplicas message.
  ChangeReplicas change_replicas = 12;
  // NOTE(review): presumably the change in Raft log size caused by this
  // command; units are not stated here, confirm against the applying code.
  int64 raft_log_delta = 13;

  // AddSSTable is a side effect that must execute before the Raft application
  // is committed. It must be idempotent to account for an ill-timed crash after
  // applying the side effect, but before committing the batch.
  //
  // TODO(tschottdorf): additionally, after the crash, the node must not serve
  // traffic until the persisted committed log has fully applied. Otherwise, we
  // risk exposing data created through such a side effect whose corresponding
  // Raft command hasn't committed yet. This isn't so much an issue with AddSSTable
  // since these Ranges are not user-visible, but it is a general concern assuming
  // other such side effects are added.
  message AddSSTable {
    option (gogoproto.equal) = true;

    bytes data = 1;
    uint32 crc32 = 2 [(gogoproto.customname) = "CRC32"];
  }
  AddSSTable add_sstable = 17 [(gogoproto.customname) = "AddSSTable"];

  // suggested_compactions are sent to the engine's compactor to
  // reclaim storage space after garbage collection or cleared /
  // rebalanced ranges.
  repeated SuggestedCompaction suggested_compactions = 19 [
    (gogoproto.nullable) = false
  ];

  // This is the proposal timestamp for the active lease while evaluating a lease request.
  // It will be used to make sure we know if a lease was extended after we sent out the request
  // but before we tried to apply it.
  util.hlc.Timestamp prev_lease_proposal = 20;

  reserved 1, 5, 7, 9, 14, 15, 16, 10001 to 10013;
}

// WriteBatch is the serialized representation of a RocksDB write
// batch. A wrapper message is used so that the absence of the field
// can be distinguished from a zero-length batch, and so structs
// containing pointers to it can be compared with the == operator.
message WriteBatch {
  bytes data = 1;
}

// LogicalOpLog is a log of logical MVCC operations. A wrapper message
// is used so that the absence of the field can be distinguished from a
// zero-length batch, and so structs containing pointers to it can be
// compared with the == operator.
message LogicalOpLog {
  repeated storage.enginepb.MVCCLogicalOp ops = 1 [
    (gogoproto.nullable) = false
  ];
}

// RaftCommand is the message written to the raft log. It contains
// some metadata about the proposal itself, then either a BatchRequest
// (legacy mode) or a ReplicatedEvalResult + WriteBatch
// (proposer-evaluated KV mode).
message RaftCommand {
  // Metadata about the proposal itself. These fields exist at
  // top-level instead of being grouped in a sub-message for
  // backwards-compatibility.

  // proposer_lease_sequence is provided to verify at raft command apply-time
  // that the lease under which the command was proposed remains in effect.
  //
  // To see why lease verification downstream of Raft is required, consider the
  // following example:
  // - replica 1 receives a client request for a write
  // - replica 1 checks the lease; the write is permitted
  // - replica 1 proposes the command
  // - time passes, replica 2 commits a new lease
  // - the command applies on replica 1
  // - replica 2 serves anomalous reads which don't see the write
  // - the command applies on replica 2
  int64 proposer_lease_sequence = 6 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.LeaseSequence"
  ];

  // deprecated_proposer_lease served the same purpose as
  // proposer_lease_sequence. As of VersionLeaseSequence, it is no longer in
  // use.
  //
  // However, unless we add a check that all existing Raft logs on all nodes
  // in the cluster contain only "new" leases, we won't be able to remove the
  // legacy code.
  roachpb.Lease deprecated_proposer_lease = 5;

  // When the command is applied, its result is an error if the lease log
  // counter has already reached (or exceeded) max_lease_index.
  //
  // The lease index is a reorder protection mechanism - we don't want Raft
  // commands (proposed by a single node, the one with proposer_lease) executing
  // in a different order than the one in which the corresponding KV requests
  // were evaluated and the commands were proposed. This is important because
  // latching does not fully serialize commands - mostly when it comes to
  // updates to the internal state of the range (this should be re-evaluated
  // once proposer-evaluated KV is completed - see #10413).
  // Similar to the Raft applied index, it is strictly increasing, but may have
  // gaps. A command will only apply successfully if its max_lease_index has not
  // been surpassed by the Range's applied lease index (in which case the
  // command may need to be retried, that is, regenerated with a higher
  // max_lease_index). When the command applies, the new lease index will
  // increase to max_lease_index (so a potential later replay will fail).
  //
  // This mechanism was introduced as a simpler alternative to using the Raft
  // applied index, which is fraught with complexity due to the need to predict
  // exactly the log position at which a command will apply, even when the Raft
  // leader is not colocated with the lease holder (which usually proposes all
  // commands).
  //
  // Pinning the lease-index to the assigned slot (as opposed to allowing gaps
  // as we do now) is an interesting avenue to explore from the standpoint of
  // parallelization: One could hope to enforce command ordering in that way
  // (without recourse to a higher-level locking primitive such as the command
  // queue). This is a hard problem: First of all, managing the pending
  // commands gets more involved; a command must not be removed if others have
  // been added after it, and on removal, the assignment counters must be
  // updated accordingly. Managing retry of proposals becomes trickier as
  // well as that uproots whatever ordering was originally envisioned.
  uint64 max_lease_index = 4;

  reserved 3;

  // Proposer-evaluated KV mode.

  // replicated_eval_result is a set of structured information that instructs
  // replicated state changes to the part of a Range's replicated state machine
  // that exists outside of RocksDB.
  ReplicatedEvalResult replicated_eval_result = 13 [
    (gogoproto.nullable) = false
  ];
  // write_batch is a RocksDB WriteBatch that will be applied to RocksDB during
  // the application of the Raft command. The batch can be thought of as a
  // series of replicated instructions that inform a RocksDB engine on how to
  // change.
  WriteBatch write_batch = 14;
  // logical_op_log contains a series of logical MVCC operations that correspond
  // to the physical operations being made in the write_batch.
  LogicalOpLog logical_op_log = 15;

  // trace_data, if not empty, contains details of proposer's trace as returned by
  // Tracer.Inject(opentracing.TextMap). Used to create span for command
  // application on all the replicas that "follow from" the proposer.
  map<string, string> trace_data = 16;

  reserved 1, 2, 10001 to 10014;
}

// RaftCommandFooter contains a subset of the fields in RaftCommand. It is used
// to optimize a pattern where most of the fields in RaftCommand are marshaled
// outside of a heavily contended critical section, except for the fields in the
// footer, which are assigned and marshaled inside of the critical section and
// appended to the marshaled byte buffer. This minimizes the memory allocation
// and marshaling work performed under lock.
message RaftCommandFooter {
  // Mirrors RaftCommand.max_lease_index: it has the same type and the same
  // field number (4), so a marshaled footer appended to a marshaled
  // RaftCommand decodes as that field. Keep the two declarations in sync.
  uint64 max_lease_index = 4;
}