github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/partition_utils.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"bytes"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/base"
    17  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    18  	"github.com/cockroachdb/cockroach/pkg/keys"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/covering"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    24  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    25  )
    26  
    27  // GenerateSubzoneSpans constructs from a TableDescriptor the entries mapping
    28  // zone config spans to subzones for use in the SubzoneSpans field of
    29  // zonepb.ZoneConfig. SubzoneSpans controls which splits are created, so only
    30  // the spans corresponding to entries in subzones are returned.
    31  //
    32  // Zone configs target indexes and partitions via `subzones`, which are attached
    33  // to a table-scoped row in `system.zones`. Each subzone represents one index
    34  // (primary or secondary) or one partition (or subpartition) and contains the
    35  // usual zone config constraints. They are saved to `system.zones` sparsely
    36  // (only when set by a user) and are the most specific entry in the normal
    37  // cluster-default/database/table/subzone config hierarchy.
    38  //
    39  // Each non-interleaved index and partition can be mapped to spans in the
    40  // keyspace. Indexes and range partitions each map to one span, while each list
    41  // partition maps to one or more spans. Each partition span is contained by some
    42  // index span and each subpartition span is contained by one of its parent
    43  // partition's spans. The spans for a given level of a range partitioning
    44  // (corresponding to one `PARTITION BY` in sql or one `PartitionDescriptor`) are
    45  // disjoint, but the spans for a given level of a list partitioning may overlap
    46  // if DEFAULT is used. A list partitioning which includes both (1, DEFAULT) and
    47  // (1, 2) will overlap with the latter getting precedence in the zone config
    48  // hierarchy. NB: In a valid PartitionDescriptor, no partitions with the same
    49  // number of DEFAULTs will overlap (this property is used by
    50  // `indexCoveringsForPartitioning`).
    51  //
    52  // These subzone spans are kept denormalized to the relevant `system.zone` row
    53  // for performance. Given a TableDescriptor, the spans for every
    54  // index/partition/subpartition are created, filtered out if they don't have a
    55  // config set for them, and precedence applied (via `OverlapCoveringMerge`) to
    56  // produce a set of non-overlapping spans, which each map to a subzone. There
    57  // may be "holes" (uncovered spans) in this set.
    58  //
    59  // The returned spans are returned in exactly the format required by
    60  // `system.zones`. They must be sorted and non-overlapping. Each contains an
    61  // IndexID, which maps to one of the input `subzones` by indexing into the
    62  // slice. As space optimizations, all `Key`s and `EndKey`s of `SubzoneSpan` omit
    63  // the common prefix (the encoded table ID) and if `EndKey` is equal to
    64  // `Key.PrefixEnd()` it is omitted.
    65  //
    66  // This function has tests in the partitionccl package.
    67  //
    68  // TODO(benesch): remove the hasNewSubzones parameter when a statement to clear
    69  // all subzones at once is introduced.
    70  func GenerateSubzoneSpans(
    71  	st *cluster.Settings,
    72  	clusterID uuid.UUID,
    73  	codec keys.SQLCodec,
    74  	tableDesc *sqlbase.TableDescriptor,
    75  	subzones []zonepb.Subzone,
    76  	hasNewSubzones bool,
    77  ) ([]zonepb.SubzoneSpan, error) {
    78  	// Removing zone configs does not require a valid license.
    79  	if hasNewSubzones {
    80  		org := ClusterOrganization.Get(&st.SV)
    81  		if err := base.CheckEnterpriseEnabled(st, clusterID, org,
    82  			"replication zones on indexes or partitions"); err != nil {
    83  			return nil, err
    84  		}
    85  	}
    86  
    87  	a := &sqlbase.DatumAlloc{}
    88  
    89  	subzoneIndexByIndexID := make(map[sqlbase.IndexID]int32)
    90  	subzoneIndexByPartition := make(map[string]int32)
    91  	for i, subzone := range subzones {
    92  		if len(subzone.PartitionName) > 0 {
    93  			subzoneIndexByPartition[subzone.PartitionName] = int32(i)
    94  		} else {
    95  			subzoneIndexByIndexID[sqlbase.IndexID(subzone.IndexID)] = int32(i)
    96  		}
    97  	}
    98  
    99  	var indexCovering covering.Covering
   100  	var partitionCoverings []covering.Covering
   101  	if err := tableDesc.ForeachNonDropIndex(func(idxDesc *sqlbase.IndexDescriptor) error {
   102  		_, indexSubzoneExists := subzoneIndexByIndexID[idxDesc.ID]
   103  		if indexSubzoneExists {
   104  			idxSpan := tableDesc.IndexSpan(codec, idxDesc.ID)
   105  			// Each index starts with a unique prefix, so (from a precedence
   106  			// perspective) it's safe to append them all together.
   107  			indexCovering = append(indexCovering, covering.Range{
   108  				Start: idxSpan.Key, End: idxSpan.EndKey,
   109  				Payload: zonepb.Subzone{IndexID: uint32(idxDesc.ID)},
   110  			})
   111  		}
   112  
   113  		var emptyPrefix []tree.Datum
   114  		indexPartitionCoverings, err := indexCoveringsForPartitioning(
   115  			a, codec, tableDesc, idxDesc, &idxDesc.Partitioning, subzoneIndexByPartition, emptyPrefix)
   116  		if err != nil {
   117  			return err
   118  		}
   119  		// The returned indexPartitionCoverings are sorted with highest
   120  		// precedence first. They all start with the index prefix, so cannot
   121  		// overlap with the partition coverings for any other index, so (from a
   122  		// precedence perspective) it's safe to append them all together.
   123  		partitionCoverings = append(partitionCoverings, indexPartitionCoverings...)
   124  
   125  		return nil
   126  	}); err != nil {
   127  		return nil, err
   128  	}
   129  
   130  	// OverlapCoveringMerge returns the payloads for any coverings that overlap
   131  	// in the same order they were input. So, we require that they be ordered
   132  	// with highest precedence first, so the first payload of each range is the
   133  	// one we need.
   134  	ranges := covering.OverlapCoveringMerge(append(partitionCoverings, indexCovering))
   135  
   136  	// NB: This assumes that none of the indexes are interleaved, which is
   137  	// checked in PartitionDescriptor validation.
   138  	sharedPrefix := codec.TablePrefix(uint32(tableDesc.ID))
   139  
   140  	var subzoneSpans []zonepb.SubzoneSpan
   141  	for _, r := range ranges {
   142  		payloads := r.Payload.([]interface{})
   143  		if len(payloads) == 0 {
   144  			continue
   145  		}
   146  		subzoneSpan := zonepb.SubzoneSpan{
   147  			Key:    bytes.TrimPrefix(r.Start, sharedPrefix),
   148  			EndKey: bytes.TrimPrefix(r.End, sharedPrefix),
   149  		}
   150  		var ok bool
   151  		if subzone := payloads[0].(zonepb.Subzone); len(subzone.PartitionName) > 0 {
   152  			subzoneSpan.SubzoneIndex, ok = subzoneIndexByPartition[subzone.PartitionName]
   153  		} else {
   154  			subzoneSpan.SubzoneIndex, ok = subzoneIndexByIndexID[sqlbase.IndexID(subzone.IndexID)]
   155  		}
   156  		if !ok {
   157  			continue
   158  		}
   159  		if bytes.Equal(subzoneSpan.Key.PrefixEnd(), subzoneSpan.EndKey) {
   160  			subzoneSpan.EndKey = nil
   161  		}
   162  		subzoneSpans = append(subzoneSpans, subzoneSpan)
   163  	}
   164  	return subzoneSpans, nil
   165  }
   166  
   167  // indexCoveringsForPartitioning returns span coverings representing the
   168  // partitions in partDesc (including subpartitions). They are sorted with
   169  // highest precedence first and the interval.Range payloads are each a
   170  // `zonepb.Subzone` with the PartitionName set.
   171  func indexCoveringsForPartitioning(
   172  	a *sqlbase.DatumAlloc,
   173  	codec keys.SQLCodec,
   174  	tableDesc *sqlbase.TableDescriptor,
   175  	idxDesc *sqlbase.IndexDescriptor,
   176  	partDesc *sqlbase.PartitioningDescriptor,
   177  	relevantPartitions map[string]int32,
   178  	prefixDatums []tree.Datum,
   179  ) ([]covering.Covering, error) {
   180  	if partDesc.NumColumns == 0 {
   181  		return nil, nil
   182  	}
   183  
   184  	var coverings []covering.Covering
   185  	var descendentCoverings []covering.Covering
   186  
   187  	if len(partDesc.List) > 0 {
   188  		// The returned spans are required to be ordered with highest precedence
   189  		// first. The span for (1, DEFAULT) overlaps with (1, 2) and needs to be
   190  		// returned at a lower precedence. Luckily, because of the partitioning
   191  		// validation, we're guaranteed that all entries in a list partitioning
   192  		// with the same number of DEFAULTs are non-overlapping. So, bucket the
   193  		// `interval.Range`s by the number of non-DEFAULT columns and return
   194  		// them ordered from least # of DEFAULTs to most.
   195  		listCoverings := make([]covering.Covering, int(partDesc.NumColumns)+1)
   196  		for _, p := range partDesc.List {
   197  			for _, valueEncBuf := range p.Values {
   198  				t, keyPrefix, err := sqlbase.DecodePartitionTuple(
   199  					a, codec, tableDesc, idxDesc, partDesc, valueEncBuf, prefixDatums)
   200  				if err != nil {
   201  					return nil, err
   202  				}
   203  				if _, ok := relevantPartitions[p.Name]; ok {
   204  					listCoverings[len(t.Datums)] = append(listCoverings[len(t.Datums)], covering.Range{
   205  						Start: keyPrefix, End: roachpb.Key(keyPrefix).PrefixEnd(),
   206  						Payload: zonepb.Subzone{PartitionName: p.Name},
   207  					})
   208  				}
   209  				newPrefixDatums := append(prefixDatums, t.Datums...)
   210  				subpartitionCoverings, err := indexCoveringsForPartitioning(
   211  					a, codec, tableDesc, idxDesc, &p.Subpartitioning, relevantPartitions, newPrefixDatums)
   212  				if err != nil {
   213  					return nil, err
   214  				}
   215  				descendentCoverings = append(descendentCoverings, subpartitionCoverings...)
   216  			}
   217  		}
   218  		for i := range listCoverings {
   219  			if covering := listCoverings[len(listCoverings)-i-1]; len(covering) > 0 {
   220  				coverings = append(coverings, covering)
   221  			}
   222  		}
   223  	}
   224  
   225  	if len(partDesc.Range) > 0 {
   226  		for _, p := range partDesc.Range {
   227  			if _, ok := relevantPartitions[p.Name]; !ok {
   228  				continue
   229  			}
   230  			_, fromKey, err := sqlbase.DecodePartitionTuple(
   231  				a, codec, tableDesc, idxDesc, partDesc, p.FromInclusive, prefixDatums)
   232  			if err != nil {
   233  				return nil, err
   234  			}
   235  			_, toKey, err := sqlbase.DecodePartitionTuple(
   236  				a, codec, tableDesc, idxDesc, partDesc, p.ToExclusive, prefixDatums)
   237  			if err != nil {
   238  				return nil, err
   239  			}
   240  			if _, ok := relevantPartitions[p.Name]; ok {
   241  				coverings = append(coverings, covering.Covering{{
   242  					Start: fromKey, End: toKey,
   243  					Payload: zonepb.Subzone{PartitionName: p.Name},
   244  				}})
   245  			}
   246  		}
   247  	}
   248  
   249  	// descendentCoverings are from subpartitions and so get precedence; append
   250  	// them to the front.
   251  	return append(descendentCoverings, coverings...), nil
   252  }