github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/batcheval/cmd_add_sstable.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package batcheval
    12  
    13  import (
    14  	"context"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    17  	"github.com/cockroachdb/cockroach/pkg/keys"
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
    21  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    22  	"github.com/cockroachdb/cockroach/pkg/storage"
    23  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    24  	"github.com/cockroachdb/cockroach/pkg/util"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/errors"
    27  	"github.com/kr/pretty"
    28  )
    29  
    30  func init() {
        	// Hook roachpb.AddSSTable up to EvalAddSSTable at package load time.
        	// DefaultDeclareKeys is passed alongside it; presumably it is the
        	// key-declaration function used for this command (confirm against
        	// RegisterReadWriteCommand).
    31  	RegisterReadWriteCommand(roachpb.AddSSTable, DefaultDeclareKeys, EvalAddSSTable)
    32  }
    33  
    34  // EvalAddSSTable evaluates an AddSSTable command.
    35  func EvalAddSSTable(
    36  	ctx context.Context, readWriter storage.ReadWriter, cArgs CommandArgs, _ roachpb.Response,
    37  ) (result.Result, error) {
    38  	args := cArgs.Args.(*roachpb.AddSSTableRequest)
    39  	h := cArgs.Header
    40  	ms := cArgs.Stats
    41  	mvccStartKey, mvccEndKey := storage.MVCCKey{Key: args.Key}, storage.MVCCKey{Key: args.EndKey}
    42  
    43  	// TODO(tschottdorf): restore the below in some form (gets in the way of testing).
    44  	// _, span := tracing.ChildSpan(ctx, fmt.Sprintf("AddSSTable [%s,%s)", args.Key, args.EndKey))
    45  	// defer tracing.FinishSpan(span)
    46  	log.Eventf(ctx, "evaluating AddSSTable [%s,%s)", mvccStartKey.Key, mvccEndKey.Key)
    47  
    48  	// IMPORT INTO should not proceed if any KVs from the SST shadow existing data
    49  	// entries - #38044.
    50  	var skippedKVStats enginepb.MVCCStats
    51  	var err error
    52  	if args.DisallowShadowing {
    53  		if skippedKVStats, err = checkForKeyCollisions(ctx, readWriter, mvccStartKey, mvccEndKey, args.Data); err != nil {
    54  			return result.Result{}, errors.Wrap(err, "checking for key collisions")
    55  		}
    56  	}
    57  
    58  	// Verify that the keys in the sstable are within the range specified by the
    59  	// request header, and if the request did not include pre-computed stats,
    60  	// compute the expected MVCC stats delta of ingesting the SST.
    61  	dataIter, err := storage.NewMemSSTIterator(args.Data, true)
    62  	if err != nil {
    63  		return result.Result{}, err
    64  	}
    65  	defer dataIter.Close()
    66  
    67  	// Check that the first key is in the expected range.
    68  	dataIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
    69  	ok, err := dataIter.Valid()
    70  	if err != nil {
    71  		return result.Result{}, err
    72  	} else if ok {
    73  		if unsafeKey := dataIter.UnsafeKey(); unsafeKey.Less(mvccStartKey) {
    74  			return result.Result{}, errors.Errorf("first key %s not in request range [%s,%s)",
    75  				unsafeKey.Key, mvccStartKey.Key, mvccEndKey.Key)
    76  		}
    77  	}
    78  
    79  	// Get the MVCCStats for the SST being ingested.
    80  	var stats enginepb.MVCCStats
    81  	if args.MVCCStats != nil {
    82  		stats = *args.MVCCStats
    83  	}
    84  
    85  	// Stats are computed on-the-fly when shadowing of keys is disallowed. If we
    86  	// took the fast path and race is enabled, assert the stats were correctly
    87  	// computed.
    88  	verifyFastPath := args.DisallowShadowing && util.RaceEnabled
    89  	if args.MVCCStats == nil || verifyFastPath {
    90  		log.VEventf(ctx, 2, "computing MVCCStats for SSTable [%s,%s)", mvccStartKey.Key, mvccEndKey.Key)
    91  
    92  		computed, err := storage.ComputeStatsGo(
    93  			dataIter, mvccStartKey.Key, mvccEndKey.Key, h.Timestamp.WallTime)
    94  		if err != nil {
    95  			return result.Result{}, errors.Wrap(err, "computing SSTable MVCC stats")
    96  		}
    97  
    98  		if verifyFastPath {
    99  			// Update the timestamp to that of the recently computed stats to get the
   100  			// diff passing.
   101  			stats.LastUpdateNanos = computed.LastUpdateNanos
   102  			if !stats.Equal(computed) {
   103  				log.Fatalf(ctx, "fast-path MVCCStats computation gave wrong result: diff(fast, computed) = %s",
   104  					pretty.Diff(stats, computed))
   105  			}
   106  		}
   107  		stats = computed
   108  	}
   109  
   110  	dataIter.SeekGE(mvccEndKey)
   111  	ok, err = dataIter.Valid()
   112  	if err != nil {
   113  		return result.Result{}, err
   114  	} else if ok {
   115  		return result.Result{}, errors.Errorf("last key %s not in request range [%s,%s)",
   116  			dataIter.UnsafeKey(), mvccStartKey.Key, mvccEndKey.Key)
   117  	}
   118  
   119  	// The above MVCCStats represents what is in this new SST.
   120  	//
   121  	// *If* the keys in the SST do not conflict with keys currently in this range,
   122  	// then adding the stats for this SST to the range stats should yield the
   123  	// correct overall stats.
   124  	//
   125  	// *However*, if the keys in this range *do* overlap with keys already in this
   126  	// range, then adding the SST semantically *replaces*, rather than adds, those
   127  	// keys, and the net effect on the stats is not so simple.
   128  	//
   129  	// To perfectly compute the correct net stats, you could a) determine the
   130  	// stats for the span of the existing range that this SST covers and subtract
   131  	// it from the range's stats, then b) use a merging iterator that reads from
   132  	// the SST and then underlying range and compute the stats of that merged
   133  	// span, and then add those stats back in. That would result in correct stats
   134  	// that reflect the merging semantics when the SST "shadows" an existing key.
   135  	//
   136  	// If the underlying range is mostly empty, this isn't terribly expensive --
   137  	// computing the existing stats to subtract is cheap, as there is little or no
   138  	// existing data to traverse and b) is also pretty cheap -- the merging
   139  	// iterator can quickly iterate the in-memory SST.
   140  	//
   141  	// However, if the underlying range is _not_ empty, then this is not cheap:
   142  	// recomputing its stats involves traversing lots of data, and iterating the
   143  	// merged iterator has to constantly go back and forth to the iterator.
   144  	//
   145  	// If we assume that most SSTs don't shadow too many keys, then the error of
   146  	// simply adding the SST stats directly to the range stats is minimal. In the
   147  	// worst-case, when we retry a whole SST, then it could be overcounting the
   148  	// entire file, but we can hope that that is rare. In the worst case, it may
   149  	// cause splitting an under-filled range that would later merge when the
   150  	// over-count is fixed.
   151  	//
   152  	// We can indicate that these stats contain this estimation using the flag in
   153  	// the MVCC stats so that later re-computations will not be surprised to find
   154  	// any discrepancies.
   155  	//
   156  	// Callers can trigger such a re-computation to fixup any discrepancies (and
   157  	// remove the ContainsEstimates flag) after they are done ingesting files by
   158  	// sending an explicit recompute.
   159  	//
   160  	// There is a significant performance win to be achieved by ensuring that the
   161  	// stats computed are not estimates as it prevents recomputation on splits.
   162  	// Running AddSSTable with disallowShadowing=true gets us close to this as we
   163  	// do not allow colliding keys to be ingested. However, in the situation that
   164  	// two SSTs have KV(s) which "perfectly" shadow an existing key (equal ts and
   165  	// value), we do not consider this a collision. While the KV would just
   166  	// overwrite the existing data, the stats would be added to the cumulative
   167  	// stats of the AddSSTable command, causing a double count for such KVs.
   168  	// Therefore, we compute the stats for these "skipped" KVs on-the-fly while
   169  	// checking for the collision condition in C++ and subtract them from the
   170  	// stats of the SST being ingested before adding them to the running
   171  	// cumulative for this command. These stats can then be marked as accurate.
   172  	if args.DisallowShadowing {
   173  		stats.Subtract(skippedKVStats)
   174  		stats.ContainsEstimates = 0
   175  	} else {
   176  		_ = clusterversion.VersionContainsEstimatesCounter // see for info on ContainsEstimates migration
   177  		stats.ContainsEstimates++
   178  	}
   179  
   180  	ms.Add(stats)
   181  
   182  	if args.IngestAsWrites {
   183  		log.VEventf(ctx, 2, "ingesting SST (%d keys/%d bytes) via regular write batch", stats.KeyCount, len(args.Data))
   184  		dataIter.SeekGE(storage.MVCCKey{Key: keys.MinKey})
   185  		for {
   186  			ok, err := dataIter.Valid()
   187  			if err != nil {
   188  				return result.Result{}, err
   189  			} else if !ok {
   190  				break
   191  			}
   192  			// NB: This is *not* a general transformation of any arbitrary SST to a
   193  			// WriteBatch: it assumes every key in the SST is a simple Set. This is
   194  			// already assumed elsewhere in this RPC though, so that's OK here.
   195  			if err := readWriter.Put(dataIter.UnsafeKey(), dataIter.UnsafeValue()); err != nil {
   196  				return result.Result{}, err
   197  			}
   198  			dataIter.Next()
   199  		}
   200  		return result.Result{}, nil
   201  	}
   202  
   203  	return result.Result{
   204  		Replicated: kvserverpb.ReplicatedEvalResult{
   205  			AddSSTable: &kvserverpb.ReplicatedEvalResult_AddSSTable{
   206  				Data:  args.Data,
   207  				CRC32: util.CRC32(args.Data),
   208  			},
   209  		},
   210  	}, nil
   211  }
   212  
   213  func checkForKeyCollisions(
   214  	_ context.Context,
   215  	readWriter storage.ReadWriter,
   216  	mvccStartKey storage.MVCCKey,
   217  	mvccEndKey storage.MVCCKey,
   218  	data []byte,
   219  ) (enginepb.MVCCStats, error) {
   220  	// We could get a spansetBatch so fetch the underlying db engine as
   221  	// we need access to the underlying C.DBIterator later, and the
   222  	// dbIteratorGetter is not implemented by a spansetBatch.
   223  	dbEngine := spanset.GetDBEngine(readWriter, roachpb.Span{Key: mvccStartKey.Key, EndKey: mvccEndKey.Key})
   224  
   225  	emptyMVCCStats := enginepb.MVCCStats{}
   226  
   227  	// Create iterator over the existing data.
   228  	existingDataIter := dbEngine.NewIterator(storage.IterOptions{UpperBound: mvccEndKey.Key})
   229  	defer existingDataIter.Close()
   230  	existingDataIter.SeekGE(mvccStartKey)
   231  	if ok, err := existingDataIter.Valid(); err != nil {
   232  		return emptyMVCCStats, errors.Wrap(err, "checking for key collisions")
   233  	} else if !ok {
   234  		// Target key range is empty, so it is safe to ingest.
   235  		return emptyMVCCStats, nil
   236  	}
   237  
   238  	return existingDataIter.CheckForKeyCollisions(data, mvccStartKey.Key, mvccEndKey.Key)
   239  }