// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Source:
// github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/metadata.proto

syntax = "proto2";
package cockroach.roachpb;
option go_package = "roachpb";

import "util/unresolved_addr.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";

// Attributes specifies a list of arbitrary strings describing
// node topology, store type, and machine capabilities.
message Attributes {
  option (gogoproto.goproto_stringer) = false;

  repeated string attrs = 1 [(gogoproto.moretags) = "yaml:\"attrs,flow\""];
}

// ReplicationTarget identifies a node/store pair.
message ReplicationTarget {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;

  optional int32 node_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  optional int32 store_id = 2 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
}

// ReplicaType identifies which raft activities a replica participates in. In
// normal operation, VOTER_FULL and LEARNER are the only used states. However,
// atomic replication changes require a transition through a "joint config"; in
// this joint config, the VOTER_DEMOTING and VOTER_INCOMING types are used as
// well to denote voters which are being downgraded to learners and newly added
// by the change, respectively. A demoting voter is turning into a learner,
// which we prefer over a direct removal, which was used prior to v20.1 and
// uses the VOTER_OUTGOING type instead (see VersionChangeReplicasDemotion for
// details on why we're not doing that any more).
//
// All voter types indicate a replica that participates in all raft activities,
// including voting for leadership and committing entries. Typically, this
// requires a majority of voters to reach a decision. In a joint config, two
// separate majorities are required: one from the set of replicas that have
// either type VOTER_FULL, VOTER_OUTGOING, or VOTER_DEMOTING, as well as one
// from the set of types VOTER_FULL and VOTER_INCOMING. For example, when type
// VOTER_FULL is assigned to replicas 1 and 2, while 3 is VOTER_OUTGOING and 4
// is VOTER_INCOMING, then the two sets over which quorums need to be achieved
// are {1,2,3} and {1,2,4}. Thus, {1,2} is a quorum of both, {1,3} is a quorum
// of the first but not the second, {1,4} is a quorum of the second but not
// the first, and {3,4} is a quorum of neither.
//
// NOTE: the numeric values below are not in declaration order (LEARNER is 1
// but is declared last). The numbers are what is encoded, so they must not be
// changed to "tidy up" the ordering.
enum ReplicaType {
  option (gogoproto.goproto_enum_prefix) = false;

  // VOTER_FULL indicates a replica that is a voter both in the
  // incoming and outgoing set.
  VOTER_FULL = 0;
  // VOTER_INCOMING indicates a voting replica that will be a
  // VOTER_FULL once the ongoing atomic replication change is finalized; that is,
  // it is in the process of being added. In practice, this replica type should
  // be treated like a VOTER_FULL.
  VOTER_INCOMING = 2;
  // VOTER_OUTGOING indicates a voting replica that will not be part
  // of the descriptor once the ongoing atomic replication change is finalized;
  // that is, it is in the process of being removed. In practice, a replica of
  // this type should be treated accordingly and no work should be assigned to
  // it.
  VOTER_OUTGOING = 3;
  // VOTER_DEMOTING indicates a voting replica that will become a learner once
  // the ongoing atomic replication change is finalized; that is, it is in the
  // process of being demoted. Since learners are currently short-lived, this
  // replica is really being removed, with an intermediate step, and no work
  // should be assigned to it.
  VOTER_DEMOTING = 4;
  // LEARNER indicates a replica that applies committed entries, but does not
  // count towards the quorum(s). Candidates will not ask for (or take into
  // account) votes of (peers they consider) LEARNERs for leadership nor do
  // their acknowledged log entries get taken into account for determining the
  // committed index. At the time of writing, learners in CockroachDB are a
  // short-term transient state: a replica being added and on its way to being a
  // VOTER_{FULL,INCOMING}, or a VOTER_DEMOTING being removed.
  LEARNER = 1;
}

// ReplicaDescriptor describes a replica location by node ID
// (corresponds to a host:port via lookup on gossip network) and store
// ID (identifies the device).
// TODO(jeffreyxiao): All nullable fields in ReplicaDescriptor can be made
// non-nullable if #38302 is guaranteed to be on all nodes (i.e. 20.1).
message ReplicaDescriptor {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  optional int32 node_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  optional int32 store_id = 2 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];

  // replica_id uniquely identifies a replica instance. If a range is removed from
  // a store and then re-added to the same store, the new instance will have a
  // higher replica_id.
  optional int32 replica_id = 3 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "ReplicaID", (gogoproto.casttype) = "ReplicaID"];

  // Type indicates which raft activities a replica participates in. A nil type
  // is equivalent to VOTER_FULL.
  optional ReplicaType type = 4;
}

// ReplicaIdent uniquely identifies a specific replica.
message ReplicaIdent {
  optional int64 range_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
  optional ReplicaDescriptor replica = 2 [(gogoproto.nullable) = false];
}

// RangeDescriptor is the value stored in a range metadata key.
// A range is described using an inclusive start key, a non-inclusive end key,
// and a list of replicas where the range is stored.
//
// NOTE: Care must be taken when changing the encoding of this proto
// because it is used as part of conditional put operations.
// TODO(jeffreyxiao): All nullable fields in RangeDescriptor can be made
// non-nullable if #38302 is guaranteed to be on all nodes (i.e. 20.1).
message RangeDescriptor {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  optional int64 range_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
  // start_key is the first key which may be contained by this range.
  optional bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
  // end_key marks the end of the range's possible keys. EndKey itself is not
  // contained in this range - it will be contained in the immediately
  // subsequent range.
  optional bytes end_key = 3 [(gogoproto.casttype) = "RKey"];

  // InternalReplicas is the set of nodes/stores on which replicas of
  // this range are stored. DO NOT USE this field directly, use the `Replicas`
  // method instead. The ordering is arbitrary and subject to permutation.
  repeated ReplicaDescriptor internal_replicas = 4 [(gogoproto.nullable) = false];

  // next_replica_id is a counter used to generate replica IDs.
  optional int32 next_replica_id = 5 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "NextReplicaID", (gogoproto.casttype) = "ReplicaID"];

  // generation is incremented on every split, merge, and every replica change,
  // i.e., whenever the span of the range or replica set changes. It is
  // initialized to zero when the range is first created. The generation
  // counter was first introduced to allow the range descriptor resulting from
  // a split and then merge to be distinguishable from the initial range
  // descriptor. This is important since changes to the range descriptors use
  // CPuts to ensure mutual exclusion.
  //
  // See #28071 for details on the above.
  //
  // Generations are also useful to make local replicaGC decisions when applying
  // a snapshot on keyspace that has overlapping replicas (but note that we do
  // not use this at the time of writing due to migration concerns; see below).
  //
  // We want to be able to compare the snapshot range's generation counter to
  // that of the overlapping replicas to draw a conclusion about whether the
  // snapshot can be applied (in which case the overlapping replicas need to be
  // safely removable). To that end, on a split, not only do we increment the
  // left hand side's generation, we also copy the resultant generation to the
  // newly created right hand side. On merges, we update the left hand side's
  // generation so that it exceeds by one the maximum of the left hand side and
  // the right hand side's generations from before the merge.
  //
  // If two replicas (perhaps one of them represented by a raft or preemptive
  // snapshot) as defined by their full range descriptor (including, notably,
  // the generation) overlap, then one of them has to be stale. This is because
  // the keyspace cleanly shards into non-overlapping ranges at all times (i.e.
  // for all consistent snapshots). Since meta ranges (or more generally, range
  // descriptors) are only ever updated transactionally, mutations to the meta
  // ranges can be serialized (i.e. put into some sequential ordering). We know
  // that the descriptors corresponding to both of our replicas can't be from
  // the same consistent snapshot of the meta ranges, so there is a version of
  // the meta ranges that includes only the first replica, and there is a
  // version that includes only the second replica. Without loss of generality,
  // assume that the first version is "older". This means that there is a finite
  // sequence of splits and merges that were applied to the consistent snapshot
  // corresponding to the first version which resulted in the second version of
  // the meta ranges.
  //
  // Each individual operation, thanks to the generational semantics above, has
  // the invariant that the resulting descriptors have a strictly larger
  // generation than any descriptors from the previous version that they cover.
  // For example, if a descriptor [a,c) at generation 5 is split into [a,b) and
  // [b,c), both of those latter range descriptors have generation 6. If [c,d)
  // is at generation 12 and [d, f) is at generation 17, then the resulting
  // merged range [c,f) will have generation 18.
  //
  // At the end of the day, for incoming snapshots, this means that we only have
  // to collect the overlapping replicas and their generations. Any replica with
  // a smaller generation is stale by the above argument and can be replicaGC'ed
  // right away. Any replica with a larger generation indicates that the snapshot
  // is stale and should be discarded. A replica with the same generation is
  // necessarily a replica of the range the snapshot is addressing (this is the
  // usual case, in which a snapshot "overlaps" precisely one replica, which is
  // the replica it's supposed to update, and no splits and merges have taken
  // place at all).
  //
  // For a third note, observe that the generational semantics above may
  // possibly allow range merges without colocation, at least in the sense that
  // the counter examples in #28071 are defused. This is because the
  // generational counter can answer the question whether the overlapping
  // replica is gc'able or not. If it is not gc'able, then by definition the
  // replica applying the merge is.
  optional int64 generation = 6 [(gogoproto.nullable) = false];
  // The presence of the sticky_bit indicates that the range should not be
  // automatically merged by the merge queue with the range to its left. It is
  // set during a split operation and unset during an unsplit operation. Note
  // that the unsplit operation is a different operation from the merge
  // operation. Unsplit only unsets sticky_bit. It is represented by a
  // timestamp that indicates when it expires. After the expiration time has
  // passed, the split is eligible for automatic merging. A nil sticky bit is
  // equivalent to hlc.Timestamp{}.
  //
  // The reason the sticky_bit exists is because when the merge queue is
  // enabled and a manual split happens, the split ranges would immediately be
  // merged by the merge queue. Previously, we threw an error when a user
  // attempted to execute ALTER TABLE/INDEX ... SPLIT AT ... when the merge
  // queue is enabled. With sticky_bit, users can manually split ranges without
  // disabling the merge queue.
  optional util.hlc.Timestamp sticky_bit = 7;

  // Field number 8 is reserved so that it cannot be assigned to a new field.
  reserved 8;
}

// Percentiles contains a handful of hard-coded percentiles meant to summarize
// a distribution.
message Percentiles {
  option (gogoproto.goproto_stringer) = false;

  // NOTE(review): pN is presumably the value at the N-th percentile and pMax
  // the maximum observed value — confirm against the code that fills these in.
  optional double p10 = 1 [(gogoproto.nullable) = false];
  optional double p25 = 2 [(gogoproto.nullable) = false];
  optional double p50 = 3 [(gogoproto.nullable) = false];
  optional double p75 = 4 [(gogoproto.nullable) = false];
  optional double p90 = 5 [(gogoproto.nullable) = false];
  optional double pMax = 6 [(gogoproto.nullable) = false];
}

// StoreCapacity contains capacity information for a storage device.
//
// NOTE: field numbers are not in declaration order (e.g. used = 8 and
// logical_bytes = 9 are declared before range_count = 3). The numbers are the
// wire identity of each field and must not be renumbered.
message StoreCapacity {
  option (gogoproto.goproto_stringer) = false;

  // Total capacity of the disk used by the store, including space used by the
  // operating system and other applications.
  optional int64 capacity = 1 [(gogoproto.nullable) = false];
  // Available space remaining on the disk used by the store.
  optional int64 available = 2 [(gogoproto.nullable) = false];
  // Amount of disk space used by the data in the CockroachDB store. Note that
  // this is going to be less than (capacity - available), because those two
  // fields consider the entire disk and everything on it, while this only
  // tracks the store's disk usage.
  optional int64 used = 8 [(gogoproto.nullable) = false];
  // Amount of logical bytes stored in the store, ignoring RocksDB space
  // overhead. Useful for rebalancing so that moving a replica from one store
  // to another actually removes its bytes from the source store even though
  // RocksDB may not actually reclaim the physical disk space for a while.
  optional int64 logical_bytes = 9 [(gogoproto.nullable) = false];
  // NOTE(review): range_count and lease_count are presumably the number of
  // range replicas and range leases held by the store — confirm against the
  // code that populates them.
  optional int32 range_count = 3 [(gogoproto.nullable) = false];
  optional int32 lease_count = 4 [(gogoproto.nullable) = false];
  // queries_per_second tracks the average number of queries processed per
  // second by replicas in the store. The stat is tracked over the time period
  // defined in storage/replica_stats.go, which as of July 2018 is 30 minutes.
  optional double queries_per_second = 10 [(gogoproto.nullable) = false];
  // writes_per_second tracks the average number of keys written per second
  // by ranges in the store. The stat is tracked over the time period defined
  // in storage/replica_stats.go, which as of July 2018 is 30 minutes.
  optional double writes_per_second = 5 [(gogoproto.nullable) = false];
  // bytes_per_replica and writes_per_replica contain percentiles for the
  // number of bytes and writes-per-second to each replica in the store.
  // This information can be used for rebalancing decisions.
  optional Percentiles bytes_per_replica = 6 [(gogoproto.nullable) = false];
  optional Percentiles writes_per_replica = 7 [(gogoproto.nullable) = false];
}

// NodeDescriptor holds details on node physical/network topology.
message NodeDescriptor {
  optional int32 node_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  // The node's base address (the sql_address comment below refers to it as
  // "the base address field").
  optional util.UnresolvedAddr address = 2 [(gogoproto.nullable) = false];
  optional Attributes attrs = 3 [(gogoproto.nullable) = false];
  optional Locality locality = 4 [(gogoproto.nullable) = false];
  // NOTE: unlike its siblings this field is not lower_snake_case. It is kept
  // as-is because a rename, while wire-compatible, changes the text/JSON
  // field name.
  optional Version ServerVersion = 5 [(gogoproto.nullable) = false];
  optional string build_tag = 6 [(gogoproto.nullable) = false];
  // NOTE(review): the unit and epoch of started_at are not stated in this
  // file — confirm against the code that populates it.
  optional int64 started_at = 7 [(gogoproto.nullable) = false];
  repeated LocalityAddress locality_address = 8 [(gogoproto.nullable) = false];
  optional string cluster_name = 9 [(gogoproto.nullable) = false];
  // The SQL address. If empty, indicates that the base address field
  // is also used to accept SQL connections.
  optional util.UnresolvedAddr sql_address = 10 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "SQLAddress"];
}

// LocalityAddress holds the private address accessible only from other nodes
// in the corresponding locality.
message LocalityAddress {
  optional util.UnresolvedAddr address = 1 [(gogoproto.nullable) = false];
  optional Tier locality_tier = 2 [(gogoproto.nullable) = false];
}

// StoreDescriptor holds store information including store attributes, node
// descriptor and store capacity.
message StoreDescriptor {
  optional int32 store_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
  optional Attributes attrs = 2 [(gogoproto.nullable) = false];
  optional NodeDescriptor node = 3 [(gogoproto.nullable) = false];
  optional StoreCapacity capacity = 4 [(gogoproto.nullable) = false];
}

// StoreDeadReplicas holds a storeID and a list of dead replicas on that store.
// Used to let the range lease holder know about corrupted or otherwise
// destroyed replicas that should be transferred to a different store.
message StoreDeadReplicas {
  optional int32 store_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
  repeated ReplicaIdent replicas = 2 [(gogoproto.nullable) = false];
}

// Locality is an ordered set of key value Tiers that describe a node's
// location. The tier keys should be the same across all nodes.
message Locality {
  option (gogoproto.goproto_stringer) = false;

  repeated Tier tiers = 1 [(gogoproto.nullable) = false];
}

// Tier represents one level of the locality hierarchy.
message Tier {
  option (gogoproto.goproto_stringer) = false;

  // Key is the name of the tier and should match across all nodes.
  optional string key = 1 [(gogoproto.nullable) = false];
  // Value is the node-specific value corresponding to the key.
  optional string value = 2 [(gogoproto.nullable) = false];
}

// Version is a version number made up of major, minor, patch and unstable
// components.
message Version {
  option (gogoproto.goproto_stringer) = false;

  // The names "major" and "minor" are reserved in C in
  // some platforms (e.g. FreeBSD), hence the "_val" suffix on the proto field
  // names; customname restores the Go field names Major and Minor.

  optional int32 major_val = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "Major"];
  optional int32 minor_val = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "Minor"];
  // Note that patch is a placeholder and will always be zero.
  optional int32 patch = 3 [(gogoproto.nullable) = false];
  // The unstable version is used to migrate during development.
  // Users of stable, public releases will only use binaries
  // with unstable set to 0.
  optional int32 unstable = 4 [(gogoproto.nullable) = false];
}