github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/kvserverpb/proposer_kv.proto

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

syntax = "proto3";
package cockroach.kv.kvserver.storagepb;
option go_package = "kvserverpb";

import "roachpb/api.proto";
import "roachpb/data.proto";
import "roachpb/metadata.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "kv/kvserver/kvserverpb/state.proto";
import "util/hlc/timestamp.proto";

import "gogoproto/gogo.proto";

// Split is emitted when a Replica commits a split trigger. It signals that the
// Replica has prepared the on-disk state for both the left and right hand
// sides of the split, and that the left hand side Replica should be updated as
// well as the right hand side created.
message Split {
  option (gogoproto.equal) = true;

  roachpb.SplitTrigger trigger = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // RHSDelta holds the statistics for what was written to what is now the
  // right-hand side of the split during the batch which executed it.
  // The on-disk state of the right-hand side is already correct, but the
  // Store must learn about this delta to update its counters appropriately.
  storage.enginepb.MVCCStats rhs_delta = 2 [(gogoproto.nullable) = false,
    (gogoproto.customname) = "RHSDelta"];
}
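
// For illustration, a minimal Go sketch (hypothetical names) of how a Store
// might consume RHSDelta when the split applies. Note that the batch's own
// stats delta covers only the left-hand side (see ReplicatedEvalResult.delta
// below), so the writes that now belong to the right-hand side are credited
// separately:
//
//   func accountForSplit(storeStats *enginepb.MVCCStats, split Split) {
//     // The RHS on-disk state is already correct; only the aggregate
//     // counters still need to learn about its contribution.
//     storeStats.Add(split.RHSDelta)
//   }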

// Merge is emitted by a Replica which commits a transaction with
// a MergeTrigger (i.e. absorbs its right neighbor).
message Merge {
  option (gogoproto.equal) = true;

  roachpb.MergeTrigger trigger = 1 [(gogoproto.nullable) = false,
    (gogoproto.embed) = true];
}

// ChangeReplicas is emitted by a Replica which commits a transaction with
// a ChangeReplicasTrigger.
message ChangeReplicas {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;

  roachpb.ChangeReplicasTrigger trigger = 1 [(gogoproto.nullable) = false,
    (gogoproto.embed) = true];
}

// ComputeChecksum is emitted when a ComputeChecksum request is evaluated. It
// instructs the replica to compute a checksum at the time the command is
// applied.
message ComputeChecksum {
  option (gogoproto.equal) = true;

  // ChecksumID is a handle by which the checksum can be retrieved in a later
  // CollectChecksum request.
  bytes checksum_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "ChecksumID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
  ];

  // The version used to pick the checksum method. Only when the version matches
  // that hardcoded in the binary will a computation be carried out.
  uint32 version = 5;

  // SaveSnapshot indicates that the snapshot used to compute the checksum
  // should be saved so that a diff of divergent replicas can later be computed.
  bool save_snapshot = 2;
  roachpb.ChecksumMode mode = 3;
  // If set, a checkpoint (i.e. cheap backup) of the engine will be taken. This
  // is expected to be set only if we already know that there is an
  // inconsistency and we want to preserve as much state as possible.
  bool checkpoint = 4;
  // Replicas processing this command which find themselves in this slice will
  // terminate. See `CheckConsistencyRequest.Terminate`.
  repeated roachpb.ReplicaDescriptor terminate = 6 [(gogoproto.nullable) = false];
}
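
// For illustration, a minimal Go sketch (hypothetical names; the real plumbing
// lives in the consistency checker) of how ChecksumID links the two steps:
// applying the command stores the result under the ID, and a later
// CollectChecksum request retrieves it by the same handle.
//
//   // checksums caches computed checksums, keyed by ComputeChecksum.ChecksumID.
//   var checksums = map[uuid.UUID][]byte{}
//
//   func applyComputeChecksum(cmd ComputeChecksum, sha []byte) {
//     checksums[cmd.ChecksumID] = sha
//   }
//
//   func collectChecksum(id uuid.UUID) ([]byte, bool) {
//     sha, ok := checksums[id]
//     return sha, ok
//   }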

// Compaction holds core details about a suggested compaction.
message Compaction {
  option (gogoproto.equal) = true;

  // bytes indicates the expected space reclamation from compaction.
  int64 bytes = 1;
  // suggested_at_nanos is the suggestion time, in nanoseconds since the epoch.
  int64 suggested_at_nanos = 2;
}

// SuggestedCompaction holds start and end keys in conjunction with
// the compaction details.
message SuggestedCompaction {
  option (gogoproto.equal) = true;

  bytes start_key = 1 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
  bytes end_key = 2 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];

  Compaction compaction = 3 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
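
// For illustration, a minimal Go sketch (hypothetical helper; field names
// assume the generated gogoproto structs) of filling in a suggestion after
// clearing or garbage-collecting a span:
//
//   func suggestCompaction(start, end roachpb.Key, reclaimable int64) SuggestedCompaction {
//     return SuggestedCompaction{
//       StartKey: start,
//       EndKey:   end,
//       Compaction: Compaction{
//         Bytes:            reclaimable,           // expected space reclamation
//         SuggestedAtNanos: time.Now().UnixNano(), // nanoseconds since the epoch
//       },
//     }
//   }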

// ReplicatedEvalResult is the structured information which together with
// a RocksDB WriteBatch constitutes the proposal payload in proposer-evaluated
// KV. For the majority of proposals, we expect ReplicatedEvalResult to be
// trivial; only changes to the metadata state (splits, merges, rebalances,
// leases, log truncation, ...) of the Replica or certain special commands must
// attach information here, based on which all Replicas must take action.
message ReplicatedEvalResult {
  option (gogoproto.equal) = true;

  // Updates to the Replica's ReplicaState. By convention and as outlined on
  // the comment on the ReplicaState message, this field is sparsely populated
  // and any field set overwrites the corresponding field in the state, perhaps
  // with additional side effects (for instance on a descriptor update).
  kv.kvserver.storagepb.ReplicaState state = 2;
  Split split = 3;
  Merge merge = 4;
  ComputeChecksum compute_checksum = 21;
  bool is_lease_request = 6;
  // Duplicates BatchRequest.Timestamp for proposer-evaluated KV. Used
  // to verify the validity of the command (for lease coverage and GC
  // threshold).
  util.hlc.Timestamp timestamp = 8 [(gogoproto.nullable) = false];
  // The stats delta corresponding to the data in this WriteBatch. On
  // a split, contains only the contributions to the left-hand side.
  storage.enginepb.MVCCStats deprecated_delta = 10; // See #18828
  storage.enginepb.MVCCStatsDelta delta = 18 [(gogoproto.nullable) = false];
  ChangeReplicas change_replicas = 12;
  int64 raft_log_delta = 13;

  // AddSSTable is a side effect that must execute before the Raft application
  // is committed. It must be idempotent to account for an ill-timed crash after
  // applying the side effect, but before committing the batch.
  //
  // TODO(tschottdorf): additionally, after the crash, the node must not serve
  // traffic until the persisted committed log has fully applied. Otherwise, we
  // risk exposing data created through such a side effect whose corresponding
  // Raft command hasn't committed yet. This isn't so much an issue with AddSSTable
  // since these Ranges are not user-visible, but it is a general concern assuming
  // other such side effects are added.
  message AddSSTable {
    option (gogoproto.equal) = true;

    bytes data = 1;
    uint32 crc32 = 2 [(gogoproto.customname) = "CRC32"];
  }
  AddSSTable add_sstable = 17 [(gogoproto.customname) = "AddSSTable"];

  // suggested_compactions are sent to the engine's compactor to
  // reclaim storage space after garbage collection or cleared /
  // rebalanced ranges.
  repeated SuggestedCompaction suggested_compactions = 19 [(gogoproto.nullable) = false];

  // This is the proposal timestamp for the active lease while evaluating a lease request.
  // It is used to detect whether the lease was extended after we sent out the request
  // but before we tried to apply it.
  util.hlc.Timestamp prev_lease_proposal = 20;

  reserved 1, 5, 7, 9, 14, 15, 16, 10001 to 10013;
}
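
// For illustration, a minimal Go sketch (hypothetical names; ReplicaState
// fields abridged) of the sparse overwrite convention for the state field:
// only fields that are set in the incoming ReplicaState replace the
// corresponding fields of the local state.
//
//   func applyStateDelta(local, delta *storagepb.ReplicaState) {
//     if delta == nil {
//       return
//     }
//     if delta.Desc != nil {
//       local.Desc = delta.Desc // descriptor updates imply further side effects
//     }
//     if delta.Lease != nil {
//       local.Lease = delta.Lease
//     }
//     if delta.GCThreshold != nil {
//       local.GCThreshold = delta.GCThreshold
//     }
//     // ...and so on for the remaining fields.
//   }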

// WriteBatch is the serialized representation of a RocksDB write
// batch. A wrapper message is used so that the absence of the field
// can be distinguished from a zero-length batch, and so structs
// containing pointers to it can be compared with the == operator.
message WriteBatch {
  bytes data = 1;
}
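
// For illustration, a minimal Go sketch of why the wrapper matters: with a
// *WriteBatch pointer, "no batch" (nil) is distinguishable from an empty
// batch, which a bare proto3 bytes field could not express.
//
//   func describe(wb *WriteBatch) string {
//     switch {
//     case wb == nil:
//       return "absent: no write batch attached"
//     case len(wb.Data) == 0:
//       return "present: a zero-length batch"
//     default:
//       return "present: a non-empty batch"
//     }
//   }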

// LogicalOpLog is a log of logical MVCC operations. A wrapper message
// is used so that the absence of the field can be distinguished from a
// zero-length batch, and so structs containing pointers to it can be
// compared with the == operator.
message LogicalOpLog {
  repeated storage.enginepb.MVCCLogicalOp ops = 1 [(gogoproto.nullable) = false];
}

// RaftCommand is the message written to the raft log. It contains
// some metadata about the proposal itself, then either a BatchRequest
// (legacy mode) or a ReplicatedEvalResult + WriteBatch
// (proposer-evaluated KV mode).
message RaftCommand {
  // Metadata about the proposal itself. These fields exist at
  // top-level instead of being grouped in a sub-message for
  // backwards-compatibility.

  // proposer_lease_sequence is provided to verify at raft command apply-time
  // that the lease under which the command was proposed remains in effect.
  //
  // To see why lease verification downstream of Raft is required, consider the
  // following example:
  // - replica 1 receives a client request for a write
  // - replica 1 checks the lease; the write is permitted
  // - replica 1 proposes the command
  // - time passes, replica 2 commits a new lease
  // - the command applies on replica 1
  // - replica 2 serves anomalous reads which don't see the write
  // - the command applies on replica 2
  int64 proposer_lease_sequence = 6 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.LeaseSequence"];
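
  // For illustration, a minimal Go sketch (hypothetical names) of the
  // apply-time check this field enables: a command proposed under a lease
  // that is no longer current is rejected rather than applied.
  //
  //   func checkProposerLease(cmd *RaftCommand, currentLease *roachpb.Lease) error {
  //     if cmd.ProposerLeaseSequence != currentLease.Sequence {
  //       // The lease changed between proposal and application; applying now
  //       // could violate guarantees handed out under the old lease (see the
  //       // anomalous-reads example above).
  //       return errors.New("proposed under invalid lease")
  //     }
  //     return nil
  //   }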

  // deprecated_proposer_lease served the same purpose as proposer_lease_sequence.
  // As of VersionLeaseSequence, it is no longer in use.
  //
  // However, unless we add a check that all existing Raft logs on all nodes
  // in the cluster contain only "new" leases, we won't be able to remove the
  // legacy code.
  roachpb.Lease deprecated_proposer_lease = 5;

  // When the command is applied, its result is an error if the Range's lease
  // applied index has already reached (or exceeded) max_lease_index.
  //
  // The lease index is a reorder protection mechanism - we don't want Raft
  // commands (proposed by a single node, the one holding the lease) executing
  // in a different order than the one in which the corresponding KV requests
  // were evaluated and the commands were proposed. This is important because
  // latching does not fully serialize commands - mostly when it comes to
  // updates to the internal state of the range (this should be re-evaluated
  // once proposer-evaluated KV is completed - see #10413).
  // Similar to the Raft applied index, it is strictly increasing, but may have
  // gaps. A command will only apply successfully if its max_lease_index has not
  // been surpassed by the Range's applied lease index (in which case the
  // command may need to be retried, that is, regenerated with a higher
  // max_lease_index). When the command applies, the lease applied index
  // advances to max_lease_index (so a potential later replay will fail).
  //
  // This mechanism was introduced as a simpler alternative to using the Raft
  // applied index, which is fraught with complexity due to the need to predict
  // exactly the log position at which a command will apply, even when the Raft
  // leader is not colocated with the lease holder (which usually proposes all
  // commands).
  //
  // Pinning the lease index to the assigned slot (as opposed to allowing gaps
  // as we do now) is an interesting avenue to explore from the standpoint of
  // parallelization: one could hope to enforce command ordering in that way
  // (without recourse to a higher-level locking primitive such as the command
  // queue). This is a hard problem: first of all, managing the pending
  // commands gets more involved; a command must not be removed if others have
  // been added after it, and on removal, the assignment counters must be
  // updated accordingly. Managing retry of proposals becomes trickier as well,
  // since that uproots whatever ordering was originally envisioned.
  uint64 max_lease_index = 4;
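
  // For illustration, a minimal Go sketch (hypothetical names) of the
  // apply-time reorder protection described above:
  //
  //   // checkMaxLeaseIndex reports whether cmd may apply given the Range's
  //   // current lease applied index, and returns the index's new value.
  //   func checkMaxLeaseIndex(cmd *RaftCommand, leaseAppliedIndex uint64) (bool, uint64) {
  //     if cmd.MaxLeaseIndex <= leaseAppliedIndex {
  //       // Reordered past another proposal or replayed: reject; the client
  //       // may repropose with a higher max_lease_index.
  //       return false, leaseAppliedIndex
  //     }
  //     return true, cmd.MaxLeaseIndex
  //   }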

  reserved 3;

  // Proposer-evaluated KV mode.

  // replicated_eval_result is a set of structured information that instructs
  // replicated state changes to the part of a Range's replicated state machine
  // that exists outside of RocksDB.
  ReplicatedEvalResult replicated_eval_result = 13 [(gogoproto.nullable) = false];
  // write_batch is a RocksDB WriteBatch that will be applied to RocksDB during
  // the application of the Raft command. The batch can be thought of as a
  // series of replicated instructions that inform a RocksDB engine on how to
  // change.
  WriteBatch write_batch = 14;
  // logical_op_log contains a series of logical MVCC operations that correspond
  // to the physical operations being made in the write_batch.
  LogicalOpLog logical_op_log = 15;

  // trace_data, if not empty, contains details of the proposer's trace as returned
  // by Tracer.Inject(opentracing.TextMap). It is used to create spans for command
  // application on all the replicas that "follow from" the proposer.
  map<string, string> trace_data = 16;
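
  // For illustration, a minimal Go sketch of populating trace_data with the
  // standard opentracing TextMap carrier (error handling elided):
  //
  //   func injectTrace(tracer opentracing.Tracer, sp opentracing.Span, cmd *RaftCommand) {
  //     cmd.TraceData = map[string]string{}
  //     _ = tracer.Inject(sp.Context(), opentracing.TextMap,
  //       opentracing.TextMapCarrier(cmd.TraceData))
  //   }
  //
  // On the apply side, Tracer.Extract with the same carrier recovers a
  // SpanContext from which a follows-from span can be started.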

  reserved 1, 2, 10001 to 10014;
}

// RaftCommandFooter contains a subset of the fields in RaftCommand. It is used
// to optimize a pattern where most of the fields in RaftCommand are marshaled
// outside of a heavily contended critical section, except for the fields in the
// footer, which are assigned and marshaled inside of the critical section and
// appended to the marshaled byte buffer. This minimizes the memory allocation
// and marshaling work performed under lock.
message RaftCommandFooter {
  uint64 max_lease_index = 4;
}
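
// For illustration, a minimal Go sketch (hypothetical names) of the
// footer-append pattern. It relies on protobuf encoding semantics:
// concatenating two encoded messages is equivalent to merging them, so
// appending an encoded RaftCommandFooter sets max_lease_index (field 4) of
// the combined message, provided the field was left at zero (and therefore
// unencoded) when the main command was marshaled.
//
//   func encodeCommand(cmd *RaftCommand, p *pendingProposal) ([]byte, error) {
//     cmd.MaxLeaseIndex = 0 // must stay unset so the footer's value wins
//     data, err := protoutil.Marshal(cmd) // expensive, done outside the lock
//     if err != nil {
//       return nil, err
//     }
//     p.mu.Lock() // contended section: assign the index, marshal the footer
//     defer p.mu.Unlock()
//     footer := RaftCommandFooter{MaxLeaseIndex: p.nextLeaseIndex()}
//     fdata, err := protoutil.Marshal(&footer)
//     if err != nil {
//       return nil, err
//     }
//     return append(data, fdata...), nil
//   }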