github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/keys/doc.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // Package keys manages the construction of keys for CockroachDB's key-value
    12  // layer.
    13  //
    14  // The keys package is necessarily tightly coupled to the storage package. In
    15  // theory, it is oblivious to higher levels of the stack. In practice, it
    16  // exposes several functions that blur abstraction boundaries to break
    17  // dependency cycles. For example, EnsureSafeSplitKey knows far too much about
    18  // how to decode SQL keys.
    19  //
    20  // 1. Overview
    21  //
    22  // This is the ten-thousand-foot view of the keyspace:
    23  //
    24  //    +------------------+
    25  //    | (empty)          | /Min
    26  //    | \x01...          | /Local            ---------------------+
    27  //    |                  |                                        |
    28  //    | ...              |                                        | local keys
    29  //    |                  |                                        |
    30  //    |                  |                   ---------------------+
    31  //    |                  |                   ---------------------+
    32  //    | \x02...          | /Meta1            ----+                |
    33  //    | \x03...          | /Meta2                |                |
    34  //    | \x04...          | /System               |                |
    35  //    |                  |                       | system keys    |
    36  //    | ...              |                       |                |
    37  //    |                  |                   ----+                |
    38  //    | \x89...          | /Table/1          ----+                |
    39  //    | \x8a...          | /Table/2              |                |
    40  //    |                  |                       | system tenant  |
    41  //    | ...              |                       |                | global keys
    42  //    |                  |                   ----+                |
    43  //    | \xfe\x8a\x89...  | /Tenant/2/Table/1 ----+                |
    44  //    | \xfe\x8a\x8a...  | /Tenant/2/Table/2     |                |
    45  //    |                  |                       | tenant 2       |
    46  //    | ...              |                       |                |
    47  //    |                  |                   ----+                |
    48  //    | \xfe...          | /Tenant/...       ----+                |
    49  //    | \xfe...          |                       |                |
    50  //    |                  |                       | tenant ...     |
    51  //    | ...              |                       |                |
    52  //    |                  |                   ----+                |
    53  //    | \xff\xff         | /Max              ---------------------+
    54  //    +------------------+
    55  //
    56  // When keys are pretty printed, the logical name to the right of the table is
    57  // shown instead of the raw byte sequence.
    58  //
    59  //
    60  // 2. Key Ranges
    61  //
    62  // The keyspace is divided into contiguous, non-overlapping chunks called
    63  // "ranges." A range is defined by its start and end keys. For example, a range
    64  // might span from [/Table/1, /Table/2), where the lower bound is inclusive and
    65  // the upper bound is exclusive. Any key that begins with /Table/1, like
    66  // /Table/1/SomePrimaryKeyValue..., would belong to this range. Key ranges
    67  // exist over the "resolved" keyspace; refer to the "Key Addressing" section
    68  // below for more details.
    69  //
    70  //
    71  // 3. Local vs. Global Keys
    72  //
    73  // There are broadly two types of keys, "local" and "global":
    74  //
    75  //  (i) Local keys, such as store- and range-specific metadata, are keys that
    76  //  must be physically collocated with the store and/or ranges they refer to but
    77  //  also logically separated so that they do not pollute the user key space.
    78  //  This is further elaborated on in the "Key Addressing" section below. Local
    79  //  data also includes data "local" to a node, such as the store metadata and
    80  //  the raft log, which is where the name originated.
    81  //
    82  //  (ii) Non-local keys (e.g. meta1, meta2, system, and SQL keys) are
    83  //  collectively referred to as "global" keys.
    84  //
    85  // NB: The empty key (/Min) is a special case. No data is stored there, but it
    86  // is used as the start key of the first range descriptor and as the starting
    87  // point for some scans, in which case it acts like a global key.
    88  //
    89  // (Check `keymap` below for a more precise breakdown of the local and global
    90  // keyspace.)
    91  //
    92  //
    93  // 4. Key Addressing
    94  //
    95  // We also have this concept of the "address" for a key. Keys get "resolved"
    96  // using `keys.Addr`, through which we're able to look up the range "containing"
    97  // the key. For global keys, the resolved key is the key itself.
    98  //
    99  // Local keys are special. For certain kinds of local keys (namely, addressable
   100  // ones), the resolved key is obtained by stripping out the local key prefix,
   101  // suffix, and optional details (refer to `keymap` below to understand how local
   102  // keys are constructed). This level of indirection was introduced so that we
   103  // could logically sort these local keys into a range other than what a
   104  // strictly physical key based sort would entail. For example, the key
   105  // /Local/Range/Table/1 would naturally sort into the range [/Min, /System), but
   106  // its "address" is /Table/1, so it actually belongs to a range like [/Table/1,
   107  // /Table/2).
   108  //
   109  // Consider the motivating example: we want to store a copy of the range
   110  // descriptor in a key that's both (a) a part of the range, and (b) does not
   111  // require us to remove a portion of the keyspace from the user (say by
   112  // reserving some key suffix). Storing this information in the global keyspace
   113  // would place the data on an arbitrary set of stores, with no guarantee of
   114  // collocation. By being able to logically sort the range descriptor key next to
   115  // the range itself, we're able to collocate the two.
   116  //
   117  //
   118  // 5. (replicated) Range-ID local keys vs. Range local keys
   119  //
   120  // Deciding between replicated range-ID local keys and range local keys is not
   121  // entirely straightforward, as the two key types serve similar purposes.
   122  // Range-ID keys, as the name suggests, use the range-ID in the key. Range local
   123  // keys instead use a key within the range bounds. Range-ID keys are not
   124  // addressable whereas range-local keys are. Note that only addressable keys can
   125  // be the target of KV operations; unaddressable keys can only be written as a
   126  // side-effect of other KV operations. This can often make the choice between
   127  // the two clear (range descriptor keys needing to be addressable, and therefore
   128  // being a range local key is one example of this).
   129  //
   130  // The "behavioral" difference between range local keys and range-id local keys
   131  // is that range local keys split and merge along range boundaries while
   132  // range-id local keys don't. We want to move as little data as possible during
   133  // splits and merges (in fact, we don't re-write any data during splits), and
   134  // that generally determines which data sits where. If we want the split point
   135  // of a range to dictate where certain keys end up, then they're likely meant to
   136  // be range local keys. If not, they're meant to be range-ID local keys. Any key
   137  // we need to re-write during splits/merges will need to go through Raft. We
   138  // have limits set on the size of Raft proposals so we generally don't want to
   139  // be re-writing lots of data.
   140  //
   141  // This naturally leads to range-id local keys being used to store metadata
   142  // about a specific Range and range local keys being used to store metadata
   143  // about specific "global" keys. Let us consider transaction record keys for
   144  // example (ignoring for a second we also need them to be addressable). Hot
   145  // ranges could potentially have lots of transaction keys. Keys destined for the
   146  // RHS of the split need to be collocated with the RHS range. By categorizing
   147  // them as range local keys, we avoid needing to re-write them during splits
   148  // as they automatically sort into the new range boundaries. If they were
   149  // range-ID local keys, we'd have to update each transaction key with the new
   150  // range ID.
   151  package keys
   152  
   153  // NB: The sorting order of the symbols below maps to the physical layout.
   154  // Preserve group-wise ordering when adding new constants.
   155  var _ = [...]interface{}{
   156  	MinKey,
   157  
   158  	// There are four types of local key data enumerated below: replicated
   159  	// range-ID, unreplicated range-ID, range local, and store-local keys.
   160  	// Local keys are constructed using a prefix, an optional infix, and a
   161  	// suffix. The prefix and infix are used to disambiguate between the four
   162  	// types of local keys listed above, and determine inter-group ordering.
   163  	// The string comment next to each symbol below is the suffix pertaining to
   164  	// the corresponding key (and determines intra-group ordering).
   165  	// 	  - RangeID replicated keys all share `LocalRangeIDPrefix` and
   166  	// 		`LocalRangeIDReplicatedInfix`.
   167  	// 	  - RangeID unreplicated keys all share `LocalRangeIDPrefix` and
   168  	// 		`localRangeIDUnreplicatedInfix`.
   169  	// 	  - Range local keys all share `LocalRangePrefix`.
   170  	// 	  - Store keys all share `localStorePrefix`.
   171  	//
   172  	// `LocalRangeIDPrefix`, `LocalRangePrefix` and `localStorePrefix` all in
   173  	// turn share `localPrefix`. `localPrefix` was chosen arbitrarily. Local
   174  	// keys would work just as well with a different prefix, like 0xff, or even
   175  	// with a suffix.
   176  
   177  	//   1. Replicated range-ID local keys: These store metadata pertaining to a
   178  	//   range as a whole. Though they are replicated, they are unaddressable.
   179  	//   Typical examples are MVCC stats and the abort span. They all share
   180  	//   `LocalRangeIDPrefix` and `LocalRangeIDReplicatedInfix`.
   181  	AbortSpanKey,                // "abc-"
   182  	RangeLastGCKey,              // "lgc-"
   183  	RangeAppliedStateKey,        // "rask"
   184  	RaftAppliedIndexLegacyKey,   // "rfta"
   185  	RaftTruncatedStateLegacyKey, // "rftt"
   186  	RangeLeaseKey,               // "rll-"
   187  	LeaseAppliedIndexLegacyKey,  // "rlla"
   188  	RangeStatsLegacyKey,         // "stat"
   189  
   190  	//   2. Unreplicated range-ID local keys: These contain metadata that
   191  	//   pertain to just one replica of a range. They are unreplicated and
   192  	//   unaddressable. The typical example is the Raft log. They all share
   193  	//   `LocalRangeIDPrefix` and `localRangeIDUnreplicatedInfix`.
   194  	RangeTombstoneKey,              // "rftb"
   195  	RaftHardStateKey,               // "rfth"
   196  	RaftLogKey,                     // "rftl"
   197  	RaftTruncatedStateKey,          // "rftt"
   198  	RangeLastReplicaGCTimestampKey, // "rlrt"
   199  
   200  	//   3. Range local keys: These also store metadata that pertains to a range
   201  	//   as a whole. They are replicated and addressable. Typical examples are
   202  	//   the range descriptor and transaction records. They all share
   203  	//   `LocalRangePrefix`.
   204  	QueueLastProcessedKey,   // "qlpt"
   205  	RangeDescriptorJointKey, // "rdjt"
   206  	RangeDescriptorKey,      // "rdsc"
   207  	TransactionKey,          // "txn-"
   208  
   209  	//   4. Store local keys: These contain metadata about an individual store.
   210  	//   They are unreplicated and unaddressable. The typical example is the
   211  	//   store 'ident' record. They all share `localStorePrefix`.
   212  	StoreSuggestedCompactionKey, // "comp"
   213  	StoreClusterVersionKey,      // "cver"
   214  	StoreGossipKey,              // "goss"
   215  	StoreHLCUpperBoundKey,       // "hlcu"
   216  	StoreIdentKey,               // "iden"
   217  	StoreLastUpKey,              // "uptm"
   218  
   219  	// The global keyspace includes the meta{1,2}, system, system tenant SQL
   220  	// keys, and non-system tenant SQL keys.
   221  	//
   222  	// 	1. Meta keys: This is where we store all key addressing data.
   223  	MetaMin,
   224  	Meta1Prefix,
   225  	Meta2Prefix,
   226  	MetaMax,
   227  
   228  	// 	2. System keys: This is where we store global, system data which is
   229  	// 	replicated across the cluster.
   230  	SystemPrefix,
   231  	NodeLivenessPrefix,  // "\x00liveness-"
   232  	BootstrapVersionKey, // "bootstrap-version"
   233  	descIDGenerator,     // "desc-idgen"
   234  	NodeIDGenerator,     // "node-idgen"
   235  	RangeIDGenerator,    // "range-idgen"
   236  	StatusPrefix,        // "status-"
   237  	StatusNodePrefix,    // "status-node-"
   238  	StoreIDGenerator,    // "store-idgen"
   239  	MigrationPrefix,     // "system-version/"
   240  	MigrationLease,      // "system-version/lease"
   241  	TimeseriesPrefix,    // "tsd"
   242  	SystemMax,
   243  
   244  	// 	3. System tenant SQL keys: This is where we store all system-tenant
   245  	// 	table data.
   246  	TableDataMin,
   247  	NamespaceTableMin,
   248  	UserTableDataMin,
   249  	TableDataMax,
   250  
   251  	// 	4. Non-system tenant SQL keys: This is where we store all non-system
   252  	// 	tenant table data.
   253  	TenantTableDataMin,
   254  	TenantTableDataMax,
   255  
   256  	MaxKey,
   257  }
   258  
   259  // Unused, deprecated keys. The array is assigned to the blank identifier, so it only references these symbols.
   260  var _ = [...]interface{}{
   261  	localRaftLastIndexSuffix,
   262  	localRangeFrozenStatusSuffix,
   263  	localRangeLastVerificationTimestampSuffix,
   264  	localRemovedLeakedRaftEntriesSuffix,
   265  	localTxnSpanGCThresholdSuffix,
   266  }