github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/metadata.proto (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  syntax = "proto2";
    12  package cockroach.roachpb;
    13  option go_package = "roachpb";
    14  
    15  import "util/unresolved_addr.proto";
    16  import "util/hlc/timestamp.proto";
    17  import "gogoproto/gogo.proto";
    18  
    19  // Attributes specifies a list of arbitrary strings describing
    20  // node topology, store type, and machine capabilities.
    21  message Attributes {
          // Disable the gogoproto-generated String() method for this message.
    22    option (gogoproto.goproto_stringer) = false;
    23  
          // attrs holds the attribute strings. The moretags option attaches the
          // struct tag `yaml:"attrs,flow"` to the generated Go field.
    24    repeated string attrs = 1 [(gogoproto.moretags) = "yaml:\"attrs,flow\""];
    25  }
    26  
    27  // ReplicationTarget identifies a node/store pair.
    28  message ReplicationTarget {
          // Disable the gogoproto-generated String() method and generate an
          // Equal() method for this message.
    29    option (gogoproto.goproto_stringer) = false;
    30    option (gogoproto.equal) = true;
    31  
          // node_id is the node half of the pair. The generated Go field is a
          // non-pointer value named NodeID with the Go type NodeID.
    32    optional int32 node_id = 1 [(gogoproto.nullable) = false,
    33        (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
          // store_id is the store half of the pair. The generated Go field is a
          // non-pointer value named StoreID with the Go type StoreID.
    34    optional int32 store_id = 2 [(gogoproto.nullable) = false,
    35        (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
    36  }
    37  
    38  // ReplicaType identifies which raft activities a replica participates in. In
    39  // normal operation, VOTER_FULL and LEARNER are the only used states. However,
    40  // atomic replication changes require a transition through a "joint config"; in
    41  // this joint config, the VOTER_DEMOTING and VOTER_INCOMING types are used as
    42  // well to denote voters which are being downgraded to learners and newly added
    43  // by the change, respectively. A demoting voter is turning into a learner,
    44  // which we prefer over a direct removal, which was used prior to v20.1 and
    45  // uses the VOTER_OUTGOING type instead (see VersionChangeReplicasDemotion for
    46  // details on why we're not doing that any more).
    47  //
    48  // All voter types indicate a replica that participates in all raft activities,
    49  // including voting for leadership and committing entries. Typically, this
    50  // requires a majority of voters to reach a decision. In a joint config, two
    51  // separate majorities are required: one from the set of replicas that have
    52  // either type VOTER_FULL, VOTER_OUTGOING or VOTER_DEMOTING, as well as that of
    53  // the set of types VOTER_FULL and VOTER_INCOMING. For example, when VOTER_FULL is
    54  // assigned to replicas 1 and 2, while 3 is VOTER_OUTGOING and 4 is
    55  // VOTER_INCOMING, then the two sets over which quorums need to be achieved are
    56  // {1,2,3} and {1,2,4}. Thus, {1,2} is a quorum of both, {1,3} is a quorum of
    57  // the first but not the second, {1,4} is a quorum of the second but not the
    58  // first, and {3,4} is a quorum of neither.
    59  enum ReplicaType {
          // Do not prefix the generated Go constants with the enum type name.
    60    option (gogoproto.goproto_enum_prefix) = false;
    61  
    62    // VOTER_FULL indicates a replica that is a voter both in the
    63    // incoming and outgoing set.
    64    VOTER_FULL = 0;
    65    // VOTER_INCOMING indicates a voting replica that will be a
    66    // VOTER_FULL once the ongoing atomic replication change is finalized; that is,
    67    // it is in the process of being added. In practice, this replica type should
    68    // be treated like a VOTER_FULL.
    69    VOTER_INCOMING = 2;
    70    // VOTER_OUTGOING indicates a voting replica that will not be part
    71    // of the descriptor once the ongoing atomic replication change is finalized;
    72    // that is, it is in the process of being removed. In practice, a replica of
    73    // this type should be treated accordingly and no work should be assigned to
    74    // it.
    75    VOTER_OUTGOING = 3;
    76    // VOTER_DEMOTING indicates a voting replica that will become a learner once
    77    // the ongoing atomic replication change is finalized; that is, it is in the
    78    // process of being demoted. Since learners are currently short-lived, this
    79    // replica is really being removed, with an intermediate step, and no work
    80    // should be assigned to it.
    81    VOTER_DEMOTING = 4;
    82    // LEARNER indicates a replica that applies committed entries, but does not
    83    // count towards the quorum(s). Candidates will not ask for (or take into
    84    // account) votes of (peers they consider) LEARNERs for leadership nor do
    85    // their acknowledged log entries get taken into account for determining the
    86    // committed index. At the time of writing, learners in CockroachDB are a
    87    // short-term transient state: a replica being added and on its way to being a
    88    // VOTER_{FULL,INCOMING}, or a VOTER_DEMOTING being removed.
          //
          // Note that LEARNER carries the value 1 even though it is declared after
          // the values 2-4; the number, not the position, identifies it on the wire,
          // so the values must not be renumbered to match declaration order.
    89    LEARNER = 1;
    90  }
    91  
    92  // ReplicaDescriptor describes a replica location by node ID
    93  // (corresponds to a host:port via lookup on gossip network) and store
    94  // ID (identifies the device).
    95  // TODO(jeffreyxiao): All nullable fields in ReplicaDescriptor can be made
    96  // non-nullable if #38302 is guaranteed to be on all nodes (I.E. 20.1).
    97  message ReplicaDescriptor {
          // Disable the gogoproto-generated String() method; generate an Equal()
          // method and a NewPopulated constructor for this message.
    98    option (gogoproto.goproto_stringer) = false;
    99    option (gogoproto.equal) = true;
   100    option (gogoproto.populate) = true;
   101  
          // node_id is the ID of the node holding the replica. The generated Go
          // field is a non-pointer value named NodeID with the Go type NodeID.
   102    optional int32 node_id = 1 [(gogoproto.nullable) = false,
   103        (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
          // store_id is the ID of the store holding the replica. The generated Go
          // field is a non-pointer value named StoreID with the Go type StoreID.
   104    optional int32 store_id = 2 [(gogoproto.nullable) = false,
   105        (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
   106  
   107    // replica_id uniquely identifies a replica instance. If a range is removed from
   108    // a store and then re-added to the same store, the new instance will have a
   109    // higher replica_id.
   110    optional int32 replica_id = 3 [(gogoproto.nullable) = false,
   111        (gogoproto.customname) = "ReplicaID", (gogoproto.casttype) = "ReplicaID"];
   112  
   113    // type indicates which raft activities a replica participates in. A nil type
   114    // is equivalent to VOTER_FULL. This is the only nullable field of the message.
   115    optional ReplicaType type = 4;
   116  }
   117  
   118  // ReplicaIdent uniquely identifies a specific replica.
   119  message ReplicaIdent {
          // range_id identifies the range the replica belongs to. The generated Go
          // field is a non-pointer value named RangeID with the Go type RangeID.
   120    optional int64 range_id = 1 [(gogoproto.nullable) = false,
   121        (gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
          // replica is the descriptor (node, store, replica ID, type) of the replica
          // itself; it is embedded by value in the generated Go struct.
   122    optional ReplicaDescriptor replica = 2 [(gogoproto.nullable) = false];
   123  }
   124  
   125  // RangeDescriptor is the value stored in a range metadata key.
   126  // A range is described using an inclusive start key, a non-inclusive end key,
   127  // and a list of replicas where the range is stored.
   128  //
   129  // NOTE: Care must be taken when changing the encoding of this proto
   130  // because it is used as part of conditional put operations.
   131  // TODO(jeffreyxiao): All nullable fields in RangeDescriptor can be made
   132  // non-nullable if #38302 is guaranteed to be on all nodes (I.E. 20.1).
   133  message RangeDescriptor {
          // Disable the gogoproto-generated String() method; generate an Equal()
          // method and a NewPopulated constructor for this message.
   134    option (gogoproto.goproto_stringer) = false;
   135    option (gogoproto.equal) = true;
   136    option (gogoproto.populate) = true;
   137  
          // range_id identifies the range. The generated Go field is a non-pointer
          // value named RangeID with the Go type RangeID.
   138    optional int64 range_id = 1 [(gogoproto.nullable) = false,
   139        (gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
   140    // start_key is the first key which may be contained by this range. The
          // generated Go field has the Go type RKey.
   141    optional bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
   142    // end_key marks the end of the range's possible keys. EndKey itself is not
   143    // contained in this range - it will be contained in the immediately
   144    // subsequent range. The generated Go field has the Go type RKey.
   145    optional bytes end_key = 3 [(gogoproto.casttype) = "RKey"];
   146  
   147    // InternalReplicas is the set of nodes/stores on which replicas of
   148    // this range are stored. DO NOT USE this field directly, use the `Replicas`
   149    // method instead. The ordering is arbitrary and subject to permutation.
   150    repeated ReplicaDescriptor internal_replicas = 4 [(gogoproto.nullable) = false];
   151  
   152    // next_replica_id is a counter used to generate replica IDs.
   153    optional int32 next_replica_id = 5 [(gogoproto.nullable) = false,
   154        (gogoproto.customname) = "NextReplicaID", (gogoproto.casttype) = "ReplicaID"];
   155  
   156    // generation is incremented on every split, merge, and every replica change,
   157    // i.e., whenever the span of the range or replica set changes. It is
   158    // initialized to zero when the range is first created. The generation
   159    // counter was first introduced to allow the range descriptor resulting from
   160    // a split and then merge to be distinguishable from the initial range
   161    // descriptor. This is important since changes to the range descriptors use
   162    // CPuts to ensure mutual exclusion.
   163    //
   164    // See #28071 for details on the above.
   165    //
   166    // Generations are also useful to make local replicaGC decisions when applying
   167    // a snapshot on keyspace that has overlapping replicas (but note that we do
   168    // not use this at the time of writing due to migration concerns; see below).
   169    //
   170    // We want to be able to compare the snapshot range's generation counter to
   171    // that of the overlapping replicas to draw a conclusion about whether the
   172    // snapshot can be applied (in which case the overlapping replicas need to be
   173    // safely removable). To that end, on a split, not only do we increment the
   174    // left hand side's generation, we also copy the resultant generation to the
   175    // newly created right hand side. On merges, we update the left hand side's
   176    // generation so that it exceeds by one the maximum of the left hand side and
   177    // the right hand side's generations from before the merge.
   178    //
   179    // If two replicas (perhaps one of them represented by a raft or preemptive
   180    // snapshot) as defined by their full range descriptor (including, notably,
   181    // the generation) overlap, then one of them has to be stale. This is because
   182    // the keyspace cleanly shards into non-overlapping ranges at all times (i.e.
   183    // for all consistent snapshots). Since meta ranges (or more generally, range
   184    // descriptors) are only ever updated transactionally, mutations to the meta
   185    // ranges can be serialized (i.e. put into some sequential ordering). We know
   186    // that the descriptors corresponding to both of our replicas can't be from
   187    // the same consistent snapshot of the meta ranges, so there is a version of
   188    // the meta ranges that includes only the first replica, and there is a
   189    // version that includes only the second replica. Without loss of generality,
   190    // assume that the first version is "older". This means that there is a finite
   191    // sequence of splits and merges that were applied to the consistent snapshot
   192    // corresponding to the first version which resulted in the second version of
   193    // the meta ranges.
   194    //
   195    // Each individual operation, thanks to the generational semantics above, has
   196    // the invariant that the resulting descriptors have a strictly larger
   197    // generation than any descriptors from the previous version that they cover.
   198    // For example, if a descriptor [a,c) at generation 5 is split into [a,b) and
   199    // [b,c), both of those latter range descriptors have generation 6. If [c,d)
   200    // is at generation 12 and [d, f) is at generation 17, then the resulting
   201    // merged range [c,f) will have generation 18.
   202    //
   203    // At the end of the day, for incoming snapshots, this means that we only have
   204    // to collect the overlapping replicas and their generations. Any replica with
   205    // a smaller generation is stale by the above argument and can be replicaGC'ed
   206    // right away. Any replica with a larger generation indicates that the snapshot
   207    // is stale and should be discarded. A replica with the same generation is
   208    // necessarily a replica of the range the snapshot is addressing (this is the
   209    // usual case, in which a snapshot "overlaps" precisely one replica, which is
   210    // the replica it's supposed to update, and no splits and merges have taken
   211    // place at all).
   212    //
   213    // For a third note, observe that the generational semantics above may
   214    // possibly allow range merges without colocation, at least in the sense that
   215    // the counter examples in #28071 are defused. This is because the
   216    // generational counter can answer the question whether the overlapping
   217    // replica is gc'able or not. If it is not gc'able, then by definition the
   218    // replica applying the merge is.
   219    optional int64 generation = 6 [(gogoproto.nullable) = false];
   220    // The presence of the sticky_bit indicates that the range should not be
   221    // automatically merged by the merge queue with the range to its left. It is
   222    // set during a split operation and unset during an unsplit operation. Note
   223    // that the unsplit operation is a different operation from the merge
   224    // operation. Unsplit only unsets sticky_bit. It is represented by a
   225    // timestamp that indicates when it expires. After the expiration time has
   226    // passed, the split is eligible for automatic merging. A nil sticky bit is
   227    // equivalent to hlc.Timestamp{}.
   228    //
   229    // The reason the sticky_bit exists is because when the merge queue is
   230    // enabled and a manual split happens, the split ranges would immediately be
   231    // merged by the merge queue. Previously, we threw an error when a user
   232    // attempted to execute ALTER TABLE/INDEX ... SPLIT AT ... when the merge
   233    // queue is enabled. With sticky_bit, users can manually split ranges without
   234    // disabling the merge queue.
   235    optional util.hlc.Timestamp sticky_bit = 7;
   236  
          // Field number 8 is reserved so that it cannot be assigned to a new field.
          // NOTE(review): presumably it belonged to a field that has since been
          // removed; the original field is not visible here.
   237    reserved 8;
   238  }
   239  
   240  // Percentiles contains a handful of hard-coded percentiles meant to summarize
   241  // a distribution.
   242  message Percentiles {
          // Disable the gogoproto-generated String() method for this message.
   243    option (gogoproto.goproto_stringer) = false;
   244  
          // NOTE(review): p10 through p90 are presumably the 10th through 90th
          // percentile values of the summarized distribution, and pMax its maximum;
          // confirm against the code that fills them in. Units depend on what is
          // being summarized. All fields are non-pointer values in the generated Go
          // struct.
   245    optional double p10 = 1 [(gogoproto.nullable) = false];
   246    optional double p25 = 2 [(gogoproto.nullable) = false];
   247    optional double p50 = 3 [(gogoproto.nullable) = false];
   248    optional double p75 = 4 [(gogoproto.nullable) = false];
   249    optional double p90 = 5 [(gogoproto.nullable) = false];
   250    optional double pMax = 6 [(gogoproto.nullable) = false];
   251  }
   252  
   253  // StoreCapacity contains capacity information for a storage device.
   254  message StoreCapacity {
          // Disable the gogoproto-generated String() method for this message.
   255    option (gogoproto.goproto_stringer) = false;
   256  
          // NOTE: field numbers in this message do not follow declaration order
          // (e.g. used = 8 and logical_bytes = 9 are declared before range_count = 3).
          // The numbers identify the fields on the wire and must not be changed.
          //
   257    // Total capacity of the disk used by the store, including space used by the
   258    // operating system and other applications.
   259    optional int64 capacity = 1 [(gogoproto.nullable) = false];
   260    // Available space remaining on the disk used by the store.
   261    optional int64 available = 2 [(gogoproto.nullable) = false];
   262    // Amount of disk space used by the data in the CockroachDB store. Note that
   263    // this is going to be less than (capacity - available), because those two
   264    // fields consider the entire disk and everything on it, while this only
   265    // tracks the store's disk usage.
   266    optional int64 used = 8 [(gogoproto.nullable) = false];
   267    // Amount of logical bytes stored in the store, ignoring RocksDB space
   268    // overhead. Useful for rebalancing so that moving a replica from one store
   269    // to another actually removes its bytes from the source store even though
   270    // RocksDB may not actually reclaim the physical disk space for a while.
   271    optional int64 logical_bytes = 9 [(gogoproto.nullable) = false];
          // NOTE(review): presumably the number of ranges with a replica on this
          // store -- confirm against the code that populates it.
   272    optional int32 range_count = 3 [(gogoproto.nullable) = false];
          // NOTE(review): presumably the number of range leases held by replicas on
          // this store -- confirm against the code that populates it.
   273    optional int32 lease_count = 4 [(gogoproto.nullable) = false];
   274    // queries_per_second tracks the average number of queries processed per
   275    // second by replicas in the store. The stat is tracked over the time period
   276    // defined in storage/replica_stats.go, which as of July 2018 is 30 minutes.
   277    optional double queries_per_second = 10 [(gogoproto.nullable) = false];
   278    // writes_per_second tracks the average number of keys written per second
   279    // by ranges in the store. The stat is tracked over the time period defined
   280    // in storage/replica_stats.go, which as of July 2018 is 30 minutes.
   281    optional double writes_per_second = 5 [(gogoproto.nullable) = false];
   282    // bytes_per_replica and writes_per_replica contain percentiles for the
   283    // number of bytes and writes-per-second to each replica in the store.
   284    // This information can be used for rebalancing decisions.
   285    optional Percentiles bytes_per_replica = 6 [(gogoproto.nullable) = false];
   286    optional Percentiles writes_per_replica = 7 [(gogoproto.nullable) = false];
   287  }
   288  
   289  // NodeDescriptor holds details on node physical/network topology.
   290  message NodeDescriptor {
          // node_id identifies the node. The generated Go field is a non-pointer
          // value named NodeID with the Go type NodeID.
   291    optional int32 node_id = 1 [(gogoproto.nullable) = false,
   292        (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
          // address is the node's base address; per the sql_address comment below it
          // also accepts SQL connections when sql_address is empty.
   293    optional util.UnresolvedAddr address = 2 [(gogoproto.nullable) = false];
          // attrs holds the node's attribute strings (see Attributes).
   294    optional Attributes attrs = 3 [(gogoproto.nullable) = false];
          // locality holds the ordered tiers describing the node's location (see
          // Locality).
   295    optional Locality locality = 4 [(gogoproto.nullable) = false];
          // ServerVersion is the node's Version. NOTE(review): unlike the other
          // fields the name is not lower_snake_case; renaming it would change
          // generated identifiers, so it is left as is.
   296    optional Version ServerVersion = 5 [(gogoproto.nullable) = false];
          // NOTE(review): presumably the build tag of the binary the node runs --
          // confirm.
   297    optional string build_tag = 6 [(gogoproto.nullable) = false];
          // NOTE(review): presumably the time at which the node started; the unit
          // and epoch of this int64 are not visible here -- confirm.
   298    optional int64 started_at = 7 [(gogoproto.nullable) = false];
          // locality_address lists addresses that are only reachable from nodes in
          // the corresponding locality tier (see LocalityAddress).
   299    repeated LocalityAddress locality_address = 8 [(gogoproto.nullable) = false];
          // NOTE(review): presumably the name of the cluster the node belongs to --
          // confirm.
   300    optional string cluster_name = 9 [(gogoproto.nullable) = false];
   301    // The SQL address. If empty, indicates that the base address field
   302    // is also used to accept SQL connections. The generated Go field is named
          // SQLAddress.
   303    optional util.UnresolvedAddr sql_address = 10 [(gogoproto.nullable) = false, (gogoproto.customname) = "SQLAddress"];
   304  }
   305  
   306  // LocalityAddress holds the private address accessible only from other nodes
   307  // in the corresponding locality.
   308  message LocalityAddress {
          // address is the private address itself.
   309    optional util.UnresolvedAddr address = 1 [(gogoproto.nullable) = false];
          // locality_tier is the key/value tier identifying the locality from which
          // the address is accessible.
   310    optional Tier locality_tier = 2 [(gogoproto.nullable) = false];
   311  }
   312  
   313  // StoreDescriptor holds store information including store attributes, node
   314  // descriptor and store capacity.
   315  message StoreDescriptor {
          // store_id identifies the store. The generated Go field is a non-pointer
          // value named StoreID with the Go type StoreID.
   316    optional int32 store_id = 1 [(gogoproto.nullable) = false,
   317        (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
          // attrs holds the store's attribute strings (see Attributes).
   318    optional Attributes attrs = 2 [(gogoproto.nullable) = false];
          // node is the descriptor of the node the store is attached to, embedded by
          // value.
   319    optional NodeDescriptor node = 3 [(gogoproto.nullable) = false];
          // capacity holds the capacity information for the store (see
          // StoreCapacity).
   320    optional StoreCapacity capacity = 4 [(gogoproto.nullable) = false];
   321  }
   322  
   323  // StoreDeadReplicas holds a storeID and a list of dead replicas on that store.
   324  // Used to let the range lease holder know about corrupted or otherwise
   325  // destroyed replicas that should be transferred to a different store.
   326  message StoreDeadReplicas {
          // store_id identifies the store holding the dead replicas. The generated
          // Go field is a non-pointer value named StoreID with the Go type StoreID.
   327    optional int32 store_id = 1 [(gogoproto.nullable) = false,
   328        (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
          // replicas lists the dead replicas on that store, each identified by its
          // range ID and replica descriptor (see ReplicaIdent).
   329    repeated ReplicaIdent replicas = 2 [(gogoproto.nullable) = false];
   330  }
   331  
   332  // Locality is an ordered set of key value Tiers that describe a node's
   333  // location. The tier keys should be the same across all nodes.
   334  message Locality {
          // Disable the gogoproto-generated String() method for this message.
   335    option (gogoproto.goproto_stringer) = false;
   336  
          // tiers is the ordered list of key/value tiers; the elements are stored by
          // value in the generated Go slice.
   337    repeated Tier tiers = 1 [(gogoproto.nullable) = false];
   338  }
   339  
   340  // Tier represents one level of the locality hierarchy.
   341  message Tier {
          // Disable the gogoproto-generated String() method for this message.
   342    option (gogoproto.goproto_stringer) = false;
   343  
   344    // key is the name of the tier; it should match the key used by all other nodes.
   345    optional string key = 1 [(gogoproto.nullable) = false];
   346    // value is the node-specific value corresponding to the key.
   347    optional string value = 2 [(gogoproto.nullable) = false];
   348  }
   349  
   350  message Version {
          // Version is a four-component version number (major, minor, patch,
          // unstable); it is the type of NodeDescriptor.ServerVersion.
          //
          // Disable the gogoproto-generated String() method for this message.
   351    option (gogoproto.goproto_stringer) = false;
   352  
   353    // The names "major" and "minor" are reserved in C on some platforms
   354    // (e.g. FreeBSD), so the proto fields carry a "_val" suffix; customname
          // gives the generated Go fields the names Major and Minor.
   355  
   356    optional int32 major_val = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "Major"];
   357    optional int32 minor_val = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "Minor"];
   358    // Note that patch is a placeholder and will always be zero.
   359    optional int32 patch = 3 [(gogoproto.nullable) = false];
   360    // The unstable version is used to migrate during development.
   361    // Users of stable, public releases will only use binaries
   362    // with unstable set to 0.
   363    optional int32 unstable = 4 [(gogoproto.nullable) = false];
   364  }
   363    optional int32 unstable = 4 [(gogoproto.nullable) = false];
   364  }