github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/row/fetcher.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package row
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/keys"
    21  	"github.com/cockroachdb/cockroach/pkg/kv"
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/scrub"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    27  	"github.com/cockroachdb/cockroach/pkg/util"
    28  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    29  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    30  	"github.com/cockroachdb/cockroach/pkg/util/log"
    31  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    32  	"github.com/cockroachdb/errors"
    33  )
    34  
// DebugRowFetch can be used to turn on some low-level debugging logs. It is a
// constant rather than a log.V check so that the hot path does not have to
// call log.V.
const DebugRowFetch = false
    38  
// kvBatchFetcher is the source of key/value batches for a Fetcher; an
// implementation is handed to Fetcher.StartScanFrom.
type kvBatchFetcher interface {
	// nextBatch returns the next batch of rows. Returns false in the first
	// parameter if there are no more keys in the scan. May return either a slice
	// of KeyValues (kvs) or a raw batchResponse, depending on the server
	// version - both must be handled by calling code.
	// NOTE(review): older wording mentioned a "numKvs" value alongside
	// batchResponse; the signature has no such result.
	nextBatch(ctx context.Context) (ok bool, kvs []roachpb.KeyValue,
		batchResponse []byte, origSpan roachpb.Span, err error)
	// GetRangesInfo returns range information gathered by the fetcher.
	// Presumably this is only populated when range info was requested (see
	// Fetcher.returnRangeInfo) - confirm against the implementations.
	GetRangesInfo() []roachpb.RangeInfo
}
    48  
// tableInfo holds the per-table state of a Fetcher: the configuration derived
// from a FetcherTableArgs in Fetcher.Init, plus the buffers and metadata that
// are updated while scanning.
type tableInfo struct {
	// -- Fields initialized once --

	// Used to determine whether a key retrieved belongs to the span we
	// want to scan.
	spans            roachpb.Spans
	desc             *sqlbase.ImmutableTableDescriptor
	index            *sqlbase.IndexDescriptor
	isSecondaryIndex bool
	indexColumnDirs  []sqlbase.IndexDescriptor_Direction
	// equivSignature is an equivalence class for each unique table-index
	// pair. It allows us to check if an index key belongs to a given
	// table-index. It is only set when the Fetcher is initialized with more
	// than one table.
	equivSignature []byte

	// The table columns to use for fetching, possibly including ones currently in
	// schema changes.
	cols []sqlbase.ColumnDescriptor

	// The set of ColumnIDs that are required.
	neededCols util.FastIntSet

	// The set of indexes into the cols array that are required for columns
	// in the value part.
	neededValueColsByIdx util.FastIntSet

	// The number of needed columns from the value part of the row. Once we've
	// seen this number of value columns for a particular row, we can stop
	// decoding values in that row.
	neededValueCols int

	// Map used to get the index for columns in cols.
	colIdxMap map[sqlbase.ColumnID]int

	// One value per column that is part of the key; each value is a column
	// index (into cols); -1 if we don't need the value for that column.
	indexColIdx []int

	// knownPrefixLength is the number of bytes in the index key prefix this
	// Fetcher is configured for. The index key prefix is the table id, index
	// id pair at the start of the key.
	knownPrefixLength int

	// -- Fields updated during a scan --

	keyValTypes []*types.T
	extraTypes  []*types.T
	keyVals     []sqlbase.EncDatum
	extraVals   []sqlbase.EncDatum
	row         sqlbase.EncDatumRow
	decodedRow  tree.Datums

	// The following fields contain MVCC metadata for each row and may be
	// returned to users of Fetcher immediately after NextRow returns.
	// They're not important to ordinary consumers of Fetcher that only
	// concern themselves with actual SQL row data.
	//
	// rowLastModified is the timestamp of the last time any family in the row
	// was modified in any way.
	rowLastModified hlc.Timestamp
	// rowIsDeleted is true when the row has been deleted. This is only
	// meaningful when kv deletion tombstones are returned by the kvBatchFetcher,
	// which the one used by `StartScan` (the common case) doesn't. Notably,
	// changefeeds use this by providing raw kvs with tombstones unfiltered via
	// `StartScanFrom`.
	rowIsDeleted bool

	// hasLast indicates whether there was a previously scanned k/v.
	hasLast bool
	// lastKV is a buffer for the current key. It is only present when
	// doing a physical check in order to verify round-trip encoding.
	// It is required because Fetcher.kv is overwritten before NextRow
	// returns.
	lastKV roachpb.KeyValue
	// lastDatums is a buffer for the previously scanned k/v datums. It is
	// only present when doing a physical check in order to verify
	// ordering.
	lastDatums tree.Datums
}
   128  
// FetcherTableArgs are the arguments passed to Fetcher.Init
// for a given table that includes descriptors and row information.
type FetcherTableArgs struct {
	// The spans of keys to return for the given table. Fetcher
	// ignores keys outside these spans.
	// This is irrelevant if Fetcher is initialized with only one
	// table.
	Spans roachpb.Spans
	// Desc is the descriptor of the table to fetch from.
	Desc *sqlbase.ImmutableTableDescriptor
	// Index is the index of Desc that is scanned.
	Index *sqlbase.IndexDescriptor
	// ColIdxMap maps a column ID to its position in Cols.
	ColIdxMap map[sqlbase.ColumnID]int
	// IsSecondaryIndex is presumably true when Index is not the table's
	// primary index; when it is set, Init rejects needed columns that Index
	// does not contain.
	IsSecondaryIndex bool
	// Cols are the table columns to use for fetching.
	Cols []sqlbase.ColumnDescriptor
	// The indexes (0 to # of columns - 1) of the columns to return.
	ValNeededForCol util.FastIntSet
}
   145  
// Fetcher handles fetching kvs and forming table rows for an
// arbitrary number of tables.
// Usage:
//   var rf Fetcher
//   err := rf.Init(..)
//   // Handle err
//   err := rf.StartScan(..)
//   // Handle err
//   for {
//      res, err := rf.NextRow()
//      // Handle err
//      if res.row == nil {
//         // Done
//         break
//      }
//      // Process res.row
//   }
type Fetcher struct {
	// codec is used to encode and decode sql keys.
	codec keys.SQLCodec

	// tables is a slice of all the tables and their descriptors for which
	// rows are returned.
	tables []tableInfo

	// allEquivSignatures is a map used for checking if an equivalence
	// signature belongs to any table or table's ancestor. It also maps the
	// string representation of every table's and every table's ancestors'
	// signature to the table's index in 'tables' for lookup during decoding.
	// If 2+ tables share the same ancestor signature, allEquivSignatures
	// will map the signature to the largest 'tables' index.
	// The full signature for a given table in 'tables' will always map to
	// its own index in 'tables'.
	// It is only allocated when the Fetcher is initialized with more than
	// one table.
	allEquivSignatures map[string]int

	// reverse denotes whether or not the spans should be read in reverse
	// or not when StartScan is invoked.
	reverse bool

	// maxKeysPerRow memoizes the maximum number of keys per row
	// out of all the tables. This is used to calculate the kvBatchFetcher's
	// firstBatchLimit.
	maxKeysPerRow int

	// True if the index key must be decoded.
	// If there is more than one table, the index key must always be decoded.
	// This is only false if there are no needed columns and the (single)
	// table has no interleave children.
	mustDecodeIndexKey bool

	// lockStr represents the row-level locking mode to use when fetching rows.
	lockStr sqlbase.ScanLockingStrength

	// returnRangeInfo, if set, causes the underlying kvBatchFetcher to return
	// information about the range descriptors/leases used in servicing the
	// requests. This has some cost, so it's only enabled by DistSQL when this
	// info is actually useful for correcting the plan (e.g. not for the PK-side
	// of an index-join).
	// If set, GetRangesInfo() can be used to retrieve the accumulated info.
	returnRangeInfo bool

	// traceKV indicates whether or not session tracing is enabled. It is set
	// when beginning a new scan.
	traceKV bool

	// -- Fields updated during a scan --

	kvFetcher      *KVFetcher
	indexKey       []byte // the index key of the current row
	prettyValueBuf *bytes.Buffer

	valueColsFound int // how many needed cols we've found so far in the value

	rowReadyTable *tableInfo // the table for which a row was fully decoded and ready for output
	currentTable  *tableInfo // the most recent table for which a key was decoded
	keySigBuf     []byte     // buffer for the index key's signature
	keyRestBuf    []byte     // buffer for the rest of the index key that is not part of the signature

	// The current key/value, unless kvEnd is true.
	kv                roachpb.KeyValue
	keyRemainingBytes []byte
	kvEnd             bool

	// isCheck indicates whether or not we are running checks for k/v
	// correctness. It is set only during SCRUB commands.
	isCheck bool

	// Buffered allocation of decoded datums.
	alloc *sqlbase.DatumAlloc
}
   236  
   237  // Reset resets this Fetcher, preserving the memory capacity that was used
   238  // for the tables slice, and the slices within each of the tableInfo objects
   239  // within tables. This permits reuse of this objects without forcing total
   240  // reallocation of all of those slice fields.
   241  func (rf *Fetcher) Reset() {
   242  	*rf = Fetcher{
   243  		tables: rf.tables[:0],
   244  	}
   245  }
   246  
// Init sets up a Fetcher for the given tables and indexes. If we are using a
// non-primary index, tables.ValNeededForCol can only refer to columns in the
// index.
//
// At least one FetcherTableArgs must be supplied; otherwise an assertion
// error is returned. For each table, Init derives the set of needed columns,
// the mapping from index key columns to output columns, and the buffers used
// while decoding, reusing slice capacity left over in rf.tables (e.g. after
// Reset) where possible. An error is also returned when a needed column is
// missing from ColIdxMap, when ValNeededForCol refers to an out-of-range
// column index, when a secondary index does not contain a needed column, or
// when one of the descriptor lookups fails.
func (rf *Fetcher) Init(
	codec keys.SQLCodec,
	reverse bool,
	lockStr sqlbase.ScanLockingStrength,
	returnRangeInfo bool,
	isCheck bool,
	alloc *sqlbase.DatumAlloc,
	tables ...FetcherTableArgs,
) error {
	if len(tables) == 0 {
		return errors.AssertionFailedf("no tables to fetch from")
	}

	rf.codec = codec
	rf.reverse = reverse
	rf.lockStr = lockStr
	rf.returnRangeInfo = returnRangeInfo
	rf.alloc = alloc
	rf.isCheck = isCheck

	// We must always decode the index key if we need to distinguish between
	// rows from more than one table.
	nTables := len(tables)
	multipleTables := nTables >= 2
	rf.mustDecodeIndexKey = multipleTables
	if multipleTables {
		rf.allEquivSignatures = make(map[string]int, len(tables))
	}

	// Reuse the existing tables slice when it is large enough; the old
	// entries are consulted below for their reusable slice capacity.
	if cap(rf.tables) >= nTables {
		rf.tables = rf.tables[:nTables]
	} else {
		rf.tables = make([]tableInfo, nTables)
	}
	for tableIdx, tableArgs := range tables {
		oldTable := rf.tables[tableIdx]

		table := tableInfo{
			spans:            tableArgs.Spans,
			desc:             tableArgs.Desc,
			colIdxMap:        tableArgs.ColIdxMap,
			index:            tableArgs.Index,
			isSecondaryIndex: tableArgs.IsSecondaryIndex,
			cols:             tableArgs.Cols,
			row:              make(sqlbase.EncDatumRow, len(tableArgs.Cols)),
			decodedRow:       make(tree.Datums, len(tableArgs.Cols)),

			// These slice fields might get re-allocated below, so reslice them from
			// the old table here in case they've got enough capacity already.
			indexColIdx: oldTable.indexColIdx[:0],
			keyVals:     oldTable.keyVals[:0],
			extraVals:   oldTable.extraVals[:0],
		}

		var err error
		if multipleTables {
			// Compute the equivalence signatures for this table-index pair. Per
			// the handling below, the last entry is the table's own signature
			// and the preceding entries belong to its ancestors.
			// Note that this ":=" declares an err scoped to this if-block,
			// shadowing the one declared above.
			equivSignatures, err := sqlbase.TableEquivSignatures(table.desc.TableDesc(), table.index)
			if err != nil {
				return err
			}
			for i, sig := range equivSignatures {
				// We always map the table's equivalence signature (last
				// 'sig' in 'equivSignatures') to its tableIdx.
				// This allows us to overwrite previous "ancestor
				// signatures" (see below).
				if i == len(equivSignatures)-1 {
					rf.allEquivSignatures[string(sig)] = tableIdx
					break
				}
				// Map each table's ancestors' signatures to -1 so
				// we know during ReadIndexKey if the parsed index
				// key belongs to ancestor or one of our tables.
				// We must check if the signature has already been set
				// since it's possible for a later 'table' to have an
				// ancestor that is a previous 'table', and we do not
				// want to overwrite the previous table's tableIdx.
				if _, exists := rf.allEquivSignatures[string(sig)]; !exists {
					rf.allEquivSignatures[string(sig)] = -1
				}
			}
			// The last signature is the given table's equivalence signature.
			table.equivSignature = equivSignatures[len(equivSignatures)-1]
		}

		// Scan through the entire columns map to see which columns are
		// required.
		for col, idx := range table.colIdxMap {
			if tableArgs.ValNeededForCol.Contains(idx) {
				// The idx-th column is required.
				table.neededCols.Add(int(col))
			}
		}

		// Only the length of the table id/index id prefix is remembered.
		table.knownPrefixLength = len(
			sqlbase.MakeIndexKeyPrefix(codec, table.desc.TableDesc(), table.index.ID),
		)

		var indexColumnIDs []sqlbase.ColumnID
		indexColumnIDs, table.indexColumnDirs = table.index.FullColumnIDs()

		// Start from all needed columns and remove those that are found in
		// the index key below; what remains must come from the value.
		table.neededValueColsByIdx = tableArgs.ValNeededForCol.Copy()
		neededIndexCols := 0
		nIndexCols := len(indexColumnIDs)
		if cap(table.indexColIdx) >= nIndexCols {
			table.indexColIdx = table.indexColIdx[:nIndexCols]
		} else {
			table.indexColIdx = make([]int, nIndexCols)
		}
		for i, id := range indexColumnIDs {
			colIdx, ok := table.colIdxMap[id]
			if ok {
				table.indexColIdx[i] = colIdx
				if table.neededCols.Contains(int(id)) {
					neededIndexCols++
					table.neededValueColsByIdx.Remove(colIdx)
				}
			} else {
				// -1 marks an index column whose value is not needed.
				table.indexColIdx[i] = -1
				if table.neededCols.Contains(int(id)) {
					return errors.AssertionFailedf("needed column %d not in colIdxMap", id)
				}
			}
		}

		// In order to track #40410 more effectively, check that the contents of
		// table.neededValueColsByIdx are valid.
		for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) {
			if idx >= len(table.row) || idx < 0 {
				return errors.AssertionFailedf(
					"neededValueColsByIdx contains an invalid index. column %d requested, but table has %d columns",
					idx,
					len(table.row),
				)
			}
		}

		// - If there is more than one table, we have to decode the index key to
		//   figure out which table the row belongs to.
		// - If there are interleaves, we need to read the index key in order to
		//   determine whether this row is actually part of the index we're scanning.
		// - If there are needed columns from the index key, we need to read it.
		//
		// Otherwise, we can completely avoid decoding the index key.
		if !rf.mustDecodeIndexKey && (neededIndexCols > 0 || len(table.index.InterleavedBy) > 0 || len(table.index.Interleave.Ancestors) > 0) {
			rf.mustDecodeIndexKey = true
		}

		// The number of columns we need to read from the value part of the key.
		// It's the total number of needed columns minus the ones we read from the
		// index key, except for composite columns.
		table.neededValueCols = table.neededCols.Len() - neededIndexCols + len(table.index.CompositeColumnIDs)

		// A secondary index can only supply the columns it contains.
		if table.isSecondaryIndex {
			for i := range table.cols {
				if table.neededCols.Contains(int(table.cols[i].ID)) && !table.index.ContainsColumnID(table.cols[i].ID) {
					return errors.Errorf("requested column %s not in index", table.cols[i].Name)
				}
			}
		}

		// Prepare our index key vals slice.
		table.keyValTypes, err = sqlbase.GetColumnTypes(table.desc.TableDesc(), indexColumnIDs)
		if err != nil {
			return err
		}
		if cap(table.keyVals) >= nIndexCols {
			table.keyVals = table.keyVals[:nIndexCols]
		} else {
			table.keyVals = make([]sqlbase.EncDatum, nIndexCols)
		}

		if hasExtraCols(&table) {
			// Unique secondary indexes have a value that is the
			// primary index key.
			// Primary indexes only contain ascendingly-encoded
			// values. If this ever changes, we'll probably have to
			// figure out the directions here too.
			// NOTE(review): err from GetColumnTypes is only checked after
			// extraVals has been resized; checking it immediately would be
			// more conventional.
			table.extraTypes, err = sqlbase.GetColumnTypes(table.desc.TableDesc(), table.index.ExtraColumnIDs)
			nExtraColumns := len(table.index.ExtraColumnIDs)
			if cap(table.extraVals) >= nExtraColumns {
				table.extraVals = table.extraVals[:nExtraColumns]
			} else {
				table.extraVals = make([]sqlbase.EncDatum, nExtraColumns)
			}
			if err != nil {
				return err
			}
		}

		// Keep track of the maximum keys per row to accommodate a
		// limitHint when StartScan is invoked.
		keysPerRow, err := table.desc.KeysPerRow(table.index.ID)
		if err != nil {
			return err
		}
		if keysPerRow > rf.maxKeysPerRow {
			rf.maxKeysPerRow = keysPerRow
		}

		rf.tables[tableIdx] = table
	}

	if len(tables) == 1 {
		// If there is more than one table, currentTable will be
		// updated every time NextKey is invoked and rowReadyTable
		// will be updated when a row is fully decoded.
		// With a single table both pointers are fixed up front.
		rf.currentTable = &(rf.tables[0])
		rf.rowReadyTable = &(rf.tables[0])
	}

	return nil
}
   463  
   464  // StartScan initializes and starts the key-value scan. Can be used multiple
   465  // times.
   466  func (rf *Fetcher) StartScan(
   467  	ctx context.Context,
   468  	txn *kv.Txn,
   469  	spans roachpb.Spans,
   470  	limitBatches bool,
   471  	limitHint int64,
   472  	traceKV bool,
   473  ) error {
   474  	if len(spans) == 0 {
   475  		return errors.AssertionFailedf("no spans")
   476  	}
   477  
   478  	rf.traceKV = traceKV
   479  	f, err := makeKVBatchFetcher(
   480  		txn,
   481  		spans,
   482  		rf.reverse,
   483  		limitBatches,
   484  		rf.firstBatchLimit(limitHint),
   485  		rf.lockStr,
   486  		rf.returnRangeInfo,
   487  	)
   488  	if err != nil {
   489  		return err
   490  	}
   491  	return rf.StartScanFrom(ctx, &f)
   492  }
   493  
   494  // StartInconsistentScan initializes and starts an inconsistent scan, where each
   495  // KV batch can be read at a different historical timestamp.
   496  //
   497  // The scan uses the initial timestamp, until it becomes older than
   498  // maxTimestampAge; at this time the timestamp is bumped by the amount of time
   499  // that has passed. See the documentation for TableReaderSpec for more
   500  // details.
   501  //
   502  // Can be used multiple times.
   503  func (rf *Fetcher) StartInconsistentScan(
   504  	ctx context.Context,
   505  	db *kv.DB,
   506  	initialTimestamp hlc.Timestamp,
   507  	maxTimestampAge time.Duration,
   508  	spans roachpb.Spans,
   509  	limitBatches bool,
   510  	limitHint int64,
   511  	traceKV bool,
   512  ) error {
   513  	if len(spans) == 0 {
   514  		return errors.AssertionFailedf("no spans")
   515  	}
   516  
   517  	txnTimestamp := initialTimestamp
   518  	txnStartTime := timeutil.Now()
   519  	if txnStartTime.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
   520  		return errors.Errorf(
   521  			"AS OF SYSTEM TIME: cannot specify timestamp older than %s for this operation",
   522  			maxTimestampAge,
   523  		)
   524  	}
   525  	txn := kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */)
   526  	txn.SetFixedTimestamp(ctx, txnTimestamp)
   527  	if log.V(1) {
   528  		log.Infof(ctx, "starting inconsistent scan at timestamp %v", txnTimestamp)
   529  	}
   530  
   531  	sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
   532  		if now := timeutil.Now(); now.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
   533  			// Time to bump the transaction. First commit the old one (should be a no-op).
   534  			if err := txn.Commit(ctx); err != nil {
   535  				return nil, err
   536  			}
   537  			// Advance the timestamp by the time that passed.
   538  			txnTimestamp = txnTimestamp.Add(now.Sub(txnStartTime).Nanoseconds(), 0 /* logical */)
   539  			txnStartTime = now
   540  			txn = kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */)
   541  			txn.SetFixedTimestamp(ctx, txnTimestamp)
   542  
   543  			if log.V(1) {
   544  				log.Infof(ctx, "bumped inconsistent scan timestamp to %v", txnTimestamp)
   545  			}
   546  		}
   547  
   548  		res, err := txn.Send(ctx, ba)
   549  		if err != nil {
   550  			return nil, err.GoError()
   551  		}
   552  		return res, nil
   553  	}
   554  
   555  	// TODO(radu): we should commit the last txn. Right now the commit is a no-op
   556  	// on read transactions, but perhaps one day it will release some resources.
   557  
   558  	rf.traceKV = traceKV
   559  	f, err := makeKVBatchFetcherWithSendFunc(
   560  		sendFunc(sendFn),
   561  		spans,
   562  		rf.reverse,
   563  		limitBatches,
   564  		rf.firstBatchLimit(limitHint),
   565  		rf.lockStr,
   566  		rf.returnRangeInfo,
   567  	)
   568  	if err != nil {
   569  		return err
   570  	}
   571  	return rf.StartScanFrom(ctx, &f)
   572  }
   573  
   574  func (rf *Fetcher) firstBatchLimit(limitHint int64) int64 {
   575  	if limitHint == 0 {
   576  		return 0
   577  	}
   578  	// If we have a limit hint, we limit the first batch size. Subsequent
   579  	// batches get larger to avoid making things too slow (e.g. in case we have
   580  	// a very restrictive filter and actually have to retrieve a lot of rows).
   581  	// The limitHint is a row limit, but each row could be made up of more than
   582  	// one key. We take the maximum possible keys per row out of all the table
   583  	// rows we could potentially scan over.
   584  	//
   585  	// We add an extra key to make sure we form the last row.
   586  	return limitHint*int64(rf.maxKeysPerRow) + 1
   587  }
   588  
   589  // StartScanFrom initializes and starts a scan from the given kvBatchFetcher. Can be
   590  // used multiple times.
   591  func (rf *Fetcher) StartScanFrom(ctx context.Context, f kvBatchFetcher) error {
   592  	rf.indexKey = nil
   593  	rf.kvFetcher = newKVFetcher(f)
   594  	// Retrieve the first key.
   595  	_, err := rf.NextKey(ctx)
   596  	return err
   597  }
   598  
// NextKey retrieves the next key/value and sets kv/kvEnd. Returns whether a row
// has been completed.
//
// rowDone is true when the scan is exhausted, or when the key just read
// starts a new row while an index key from a previous row is still recorded
// in rf.indexKey (which is then cleared). Keys that ReadIndexKey reports as
// not belonging to any configured table are skipped.
func (rf *Fetcher) NextKey(ctx context.Context) (rowDone bool, err error) {
	// ok is reused for the results of both NextKV and ReadIndexKey.
	var ok bool

	// The loop only repeats through the `continue` below, i.e. when a key is
	// skipped; every other path returns.
	for {
		ok, rf.kv, _, err = rf.kvFetcher.NextKV(ctx)
		if err != nil {
			return false, err
		}
		rf.kvEnd = !ok
		if rf.kvEnd {
			// No more keys in the scan. We need to transition
			// rf.rowReadyTable to rf.currentTable for the last
			// row.
			//
			// NB: this assumes that the KV layer will never split a range
			// between column families, which is a brittle assumption.
			// See:
			// https://github.com/cockroachdb/cockroach/pull/42056
			rf.rowReadyTable = rf.currentTable
			return true, nil
		}

		// foundNull is set when decoding a new index key for a row finds a NULL value
		// in the index key. This is used when decoding unique secondary indexes in order
		// to tell whether they have extra columns appended to the key.
		var foundNull bool

		// unchangedPrefix will be set to true if we can skip decoding the index key
		// completely, because the last key we saw has identical prefix to the
		// current key.
		unchangedPrefix := rf.indexKey != nil && bytes.HasPrefix(rf.kv.Key, rf.indexKey)
		if unchangedPrefix {
			keySuffix := rf.kv.Key[len(rf.indexKey):]
			if _, foundSentinel := encoding.DecodeIfInterleavedSentinel(keySuffix); foundSentinel {
				// We found an interleaved sentinel, which means that the key we just
				// found belongs to a different interleave. That means we have to go
				// through with index key decoding.
				unchangedPrefix = false
			} else {
				rf.keyRemainingBytes = keySuffix
			}
		}
		// See Init() for a detailed description of when we can get away with not
		// reading the index key.
		if unchangedPrefix {
			// Skip decoding!
			// We must set the rowReadyTable to the currentTable like ReadIndexKey
			// would do. This will happen when we see 2 rows in a row with the same
			// prefix. If the previous prefix was from a different table, then we must
			// update the ready table to the current table, updating the fetcher state
			// machine to recognize that the next row that it outputs will be from
			// rf.currentTable, which will be set to the table of the key that was
			// last sent to ReadIndexKey.
			//
			// TODO(jordan): this is a major (but correct) mess. The fetcher is past
			// due for a refactor, now that it's (more) clear what the state machine
			// it's trying to model is.
			rf.rowReadyTable = rf.currentTable
		} else if rf.mustDecodeIndexKey || rf.traceKV {
			// The index key is also decoded when KV tracing is on, even if no
			// key column is otherwise needed.
			rf.keyRemainingBytes, ok, foundNull, err = rf.ReadIndexKey(rf.kv.Key)
			if err != nil {
				return false, err
			}
			if !ok {
				// The key did not match any of the table
				// descriptors, which means it's interleaved
				// data from some other table or index.
				continue
			}
		} else {
			// We still need to consume the key until the family
			// id, so processKV can know whether we've finished a
			// row or not.
			prefixLen, err := keys.GetRowPrefixLength(rf.kv.Key)
			if err != nil {
				return false, err
			}

			rf.keyRemainingBytes = rf.kv.Key[prefixLen:]
		}

		// For unique secondary indexes, the index-key does not distinguish one row
		// from the next if both rows contain identical values along with a NULL.
		// Consider the keys:
		//
		//   /test/unique_idx/NULL/0
		//   /test/unique_idx/NULL/1
		//
		// The index-key extracted from the above keys is /test/unique_idx/NULL. The
		// trailing /0 and /1 are the primary key used to unique-ify the keys when a
		// NULL is present. When a null is present in the index key, we cut off more
		// of the index key so that the prefix includes the primary key columns.
		//
		// Note that we do not need to do this for non-unique secondary indexes because
		// the extra columns in the primary key will _always_ be there, so we can decode
		// them when processing the index. The difference with unique secondary indexes
		// is that the extra columns are not always there, and are used to unique-ify
		// the index key, rather than provide the primary key column values.
		if foundNull && rf.currentTable.isSecondaryIndex && rf.currentTable.index.Unique && len(rf.currentTable.desc.Families) != 1 {
			for range rf.currentTable.index.ExtraColumnIDs {
				var err error
				// Slice off an extra encoded column from rf.keyRemainingBytes.
				rf.keyRemainingBytes, err = sqlbase.SkipTableKey(rf.keyRemainingBytes)
				if err != nil {
					return false, err
				}
			}
		}

		switch {
		case len(rf.currentTable.desc.Families) == 1:
			// If we only have one family, we know that there is only 1 k/v pair per row.
			rowDone = true
		case !unchangedPrefix:
			// If the prefix of the key has changed, current key is from a different
			// row than the previous one.
			rowDone = true
		case rf.rowReadyTable != rf.currentTable:
			// For rowFetchers with more than one table, if the table changes the row
			// is done.
			rowDone = true
		default:
			rowDone = false
		}

		// A completed row is only reported when a previous row's index key is
		// recorded; rf.indexKey is presumably populated by the code that
		// processes each kv, which lives outside this function.
		if rf.indexKey != nil && rowDone {
			// The current key belongs to a new row. Output the
			// current row.
			rf.indexKey = nil
			return true, nil
		}

		return false, nil
	}
}
   736  
   737  func (rf *Fetcher) prettyEncDatums(types []*types.T, vals []sqlbase.EncDatum) string {
   738  	var buf strings.Builder
   739  	for i, v := range vals {
   740  		if err := v.EnsureDecoded(types[i], rf.alloc); err != nil {
   741  			buf.WriteString("error decoding: ")
   742  			buf.WriteString(err.Error())
   743  		}
   744  		buf.WriteByte('/')
   745  		buf.WriteString(v.Datum.String())
   746  	}
   747  	return buf.String()
   748  }
   749  
   750  // ReadIndexKey decodes an index key for a given table.
   751  // It returns whether or not the key is for any of the tables initialized
   752  // in Fetcher, and the remaining part of the key if it is.
   753  // ReadIndexKey additionally returns whether or not it encountered a null while decoding.
   754  func (rf *Fetcher) ReadIndexKey(
   755  	key roachpb.Key,
   756  ) (remaining []byte, ok bool, foundNull bool, err error) {
   757  	// If there is only one table to check keys for, there is no need
   758  	// to go through the equivalence signature checks.
   759  	if len(rf.tables) == 1 {
   760  		return sqlbase.DecodeIndexKeyWithoutTableIDIndexIDPrefix(
   761  			rf.currentTable.desc.TableDesc(),
   762  			rf.currentTable.index,
   763  			rf.currentTable.keyValTypes,
   764  			rf.currentTable.keyVals,
   765  			rf.currentTable.indexColumnDirs,
   766  			key[rf.currentTable.knownPrefixLength:],
   767  		)
   768  	}
   769  
   770  	// Make a copy of the initial key for validating whether it's within
   771  	// the table's specified spans.
   772  	initialKey := key
   773  
   774  	// key now contains the bytes in the key (if match) that are not part
   775  	// of the signature in order.
   776  	tableIdx, key, match, err := sqlbase.IndexKeyEquivSignature(key, rf.allEquivSignatures, rf.keySigBuf, rf.keyRestBuf)
   777  	if err != nil {
   778  		return nil, false, false, err
   779  	}
   780  	// The index key does not belong to our table because either:
   781  	// !match:	    part of the index key's signature did not match any of
   782  	//		    rf.allEquivSignatures.
   783  	// tableIdx == -1:  index key belongs to an ancestor.
   784  	if !match || tableIdx == -1 {
   785  		return nil, false, false, nil
   786  	}
   787  
   788  	// The index key is not within our specified span of keys for the
   789  	// particular table.
   790  	// TODO(richardwu): ContainsKey checks every span within spans. We
   791  	// can check that spans is ordered (or sort it) and memoize
   792  	// the last span we've checked for each table. We can pass in this
   793  	// information to ContainsKey as a hint for which span to start
   794  	// checking first.
   795  	if !rf.tables[tableIdx].spans.ContainsKey(initialKey) {
   796  		return nil, false, false, nil
   797  	}
   798  
   799  	// Either a new table is encountered or the rowReadyTable differs from
   800  	// the currentTable (the rowReadyTable was outputted in the previous
   801  	// read). We transition the references.
   802  	if &rf.tables[tableIdx] != rf.currentTable || rf.rowReadyTable != rf.currentTable {
   803  		rf.rowReadyTable = rf.currentTable
   804  		rf.currentTable = &rf.tables[tableIdx]
   805  
   806  		// rf.rowReadyTable is nil if this is the very first key.
   807  		// We want to ensure this does not differ from rf.currentTable
   808  		// to prevent another transition.
   809  		if rf.rowReadyTable == nil {
   810  			rf.rowReadyTable = rf.currentTable
   811  		}
   812  	}
   813  
   814  	// We can simply decode all the column values we retrieved
   815  	// when processing the ind
   816  	// ex key. The column values are at the
   817  	// front of the key.
   818  	if key, foundNull, err = sqlbase.DecodeKeyVals(
   819  		rf.currentTable.keyValTypes,
   820  		rf.currentTable.keyVals,
   821  		rf.currentTable.indexColumnDirs,
   822  		key,
   823  	); err != nil {
   824  		return nil, false, false, err
   825  	}
   826  
   827  	return key, true, foundNull, nil
   828  }
   829  
// processKV processes the given key/value, setting values in the row of
// rf.currentTable accordingly. If rf.traceKV is true, it also returns pretty
// printed key and value information in prettyKey/prettyValue (otherwise they
// are empty strings).
//
// The index-key columns must already have been decoded into table.keyVals and
// rf.keyRemainingBytes must hold the part of kv.Key that follows them; both
// are read here but not computed here. Decoding failures are returned wrapped
// as scrub errors.
func (rf *Fetcher) processKV(
	ctx context.Context, kv roachpb.KeyValue,
) (prettyKey string, prettyValue string, err error) {
	table := rf.currentTable

	if rf.traceKV {
		// The pretty key starts as /<table>/<index>/<key column values>; the
		// value-processing helpers below may append column names to it.
		prettyKey = fmt.Sprintf(
			"/%s/%s%s",
			table.desc.Name,
			table.index.Name,
			rf.prettyEncDatums(table.keyValTypes, table.keyVals),
		)
	}

	// Either this is the first key of the fetch or the first key of a new
	// row. rf.indexKey == nil is the marker for "no row in progress".
	if rf.indexKey == nil {
		// This is the first key for the row. The index key is kv.Key with the
		// trailing rf.keyRemainingBytes stripped off.
		rf.indexKey = []byte(kv.Key[:len(kv.Key)-len(rf.keyRemainingBytes)])

		// Reset the row to nil; it will get filled in with the column
		// values as we decode the key-value pairs for the row.
		// We only need to reset the needed columns in the value component, because
		// non-needed columns are never set and key columns are unconditionally set
		// below.
		for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) {
			table.row[idx].UnsetDatum()
		}

		// Fill in the column values that are part of the index key. An index of
		// -1 marks a key column that has no slot in the row.
		for i := range table.keyVals {
			if idx := table.indexColIdx[i]; idx != -1 {
				table.row[idx] = table.keyVals[i]
			}
		}

		rf.valueColsFound = 0

		// Reset the MVCC metadata for the next row.

		// set rowLastModified to a sentinel that's before any real timestamp.
		// As kvs are iterated for this row, it keeps track of the greatest
		// timestamp seen.
		table.rowLastModified = hlc.Timestamp{}
		// All row encodings (both before and after column families) have a
		// sentinel kv (column family 0) that is always present when a row is
		// present, even if that row is all NULLs. Thus, a row is deleted if and
		// only if the first kv in it is a tombstone (RawBytes is empty).
		table.rowIsDeleted = len(kv.Value.RawBytes) == 0
	}

	// Track the greatest timestamp seen across all kvs of the row.
	if table.rowLastModified.Less(kv.Value.Timestamp) {
		table.rowLastModified = kv.Value.Timestamp
	}

	if table.neededCols.Empty() {
		// We don't need to decode any values.
		if rf.traceKV {
			prettyValue = tree.DNull.String()
		}
		return prettyKey, prettyValue, nil
	}

	// For covering secondary indexes, allow for decoding as a primary key.
	if table.index.GetEncodingType(table.desc.PrimaryIndex.ID) == sqlbase.PrimaryIndexEncoding &&
		len(rf.keyRemainingBytes) > 0 {
		// If familyID is 0, kv.Value contains values for composite key columns.
		// These columns already have a table.row value assigned above, but that value
		// (obtained from the key encoding) might not be correct (e.g. for decimals,
		// it might not contain the right number of trailing 0s; for collated
		// strings, it is one of potentially many strings with the same collation
		// key).
		//
		// In these cases, the correct value will be present in family 0 and the
		// table.row value gets overwritten.

		switch kv.Value.GetTag() {
		case roachpb.ValueType_TUPLE:
			// In this case, we don't need to decode the column family ID, because
			// the ValueType_TUPLE encoding includes the column id with every encoded
			// column value.
			prettyKey, prettyValue, err = rf.processValueTuple(ctx, table, kv, prettyKey)
		default:
			// Any other tag is treated as a single-column value: the family ID
			// is read from the remaining key bytes and used to look up the
			// family (and thereby its default column).
			var familyID uint64
			_, familyID, err = encoding.DecodeUvarintAscending(rf.keyRemainingBytes)
			if err != nil {
				return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err)
			}

			var family *sqlbase.ColumnFamilyDescriptor
			family, err = table.desc.FindFamilyByID(sqlbase.FamilyID(familyID))
			if err != nil {
				return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err)
			}

			prettyKey, prettyValue, err = rf.processValueSingle(ctx, table, family, kv, prettyKey)
		}
		// Shared error check for both value-processing paths above.
		if err != nil {
			return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
		}
	} else {
		tag := kv.Value.GetTag()
		// valueBytes stays empty for tags other than BYTES and TUPLE, in which
		// case no value columns are decoded below.
		var valueBytes []byte
		switch tag {
		case roachpb.ValueType_BYTES:
			// If we have the ValueType_BYTES on a secondary index, then we know we
			// are looking at column family 0. Column family 0 stores the extra primary
			// key columns if they are present, so we decode them here.
			valueBytes, err = kv.Value.GetBytes()
			if err != nil {
				return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
			}
			if hasExtraCols(table) {
				// This is a unique secondary index; decode the extra
				// column values from the value.
				var err error
				valueBytes, _, err = sqlbase.DecodeKeyVals(
					table.extraTypes,
					table.extraVals,
					nil,
					valueBytes,
				)
				if err != nil {
					return "", "", scrub.WrapError(scrub.SecondaryIndexKeyExtraValueDecodingError, err)
				}
				// Copy only the extra columns that were actually requested
				// into the row.
				for i, id := range table.index.ExtraColumnIDs {
					if table.neededCols.Contains(int(id)) {
						table.row[table.colIdxMap[id]] = table.extraVals[i]
					}
				}
				if rf.traceKV {
					prettyValue = rf.prettyEncDatums(table.extraTypes, table.extraVals)
				}
			}
		case roachpb.ValueType_TUPLE:
			valueBytes, err = kv.Value.GetTuple()
			if err != nil {
				return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
			}
		}

		if DebugRowFetch {
			if hasExtraCols(table) && tag == roachpb.ValueType_BYTES {
				log.Infof(ctx, "Scan %s -> %s", kv.Key, rf.prettyEncDatums(table.extraTypes, table.extraVals))
			} else {
				log.Infof(ctx, "Scan %s", kv.Key)
			}
		}

		// Whatever is left in valueBytes holds the encoded value columns.
		if len(valueBytes) > 0 {
			prettyKey, prettyValue, err = rf.processValueBytes(
				ctx, table, kv, valueBytes, prettyKey,
			)
			if err != nil {
				return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
			}
		}
	}

	// When tracing, a kv that produced no pretty value is reported as NULL.
	if rf.traceKV && prettyValue == "" {
		prettyValue = tree.DNull.String()
	}

	return prettyKey, prettyValue, nil
}
   998  
   999  // processValueSingle processes the given value (of column
  1000  // family.DefaultColumnID), setting values in table.row accordingly. The key is
  1001  // only used for logging.
  1002  func (rf *Fetcher) processValueSingle(
  1003  	ctx context.Context,
  1004  	table *tableInfo,
  1005  	family *sqlbase.ColumnFamilyDescriptor,
  1006  	kv roachpb.KeyValue,
  1007  	prettyKeyPrefix string,
  1008  ) (prettyKey string, prettyValue string, err error) {
  1009  	prettyKey = prettyKeyPrefix
  1010  
  1011  	// If this is the row sentinel (in the legacy pre-family format),
  1012  	// a value is not expected, so we're done.
  1013  	if family.ID == 0 {
  1014  		return "", "", nil
  1015  	}
  1016  
  1017  	colID := family.DefaultColumnID
  1018  	if colID == 0 {
  1019  		return "", "", errors.Errorf("single entry value with no default column id")
  1020  	}
  1021  
  1022  	if rf.traceKV || table.neededCols.Contains(int(colID)) {
  1023  		if idx, ok := table.colIdxMap[colID]; ok {
  1024  			if rf.traceKV {
  1025  				prettyKey = fmt.Sprintf("%s/%s", prettyKey, table.desc.DeletableColumns()[idx].Name)
  1026  			}
  1027  			if len(kv.Value.RawBytes) == 0 {
  1028  				return prettyKey, "", nil
  1029  			}
  1030  			typ := table.cols[idx].Type
  1031  			// TODO(arjun): The value is a directly marshaled single value, so we
  1032  			// unmarshal it eagerly here. This can potentially be optimized out,
  1033  			// although that would require changing UnmarshalColumnValue to operate
  1034  			// on bytes, and for Encode/DecodeTableValue to operate on marshaled
  1035  			// single values.
  1036  			value, err := sqlbase.UnmarshalColumnValue(rf.alloc, typ, kv.Value)
  1037  			if err != nil {
  1038  				return "", "", err
  1039  			}
  1040  			if rf.traceKV {
  1041  				prettyValue = value.String()
  1042  			}
  1043  			table.row[idx] = sqlbase.DatumToEncDatum(typ, value)
  1044  			if DebugRowFetch {
  1045  				log.Infof(ctx, "Scan %s -> %v", kv.Key, value)
  1046  			}
  1047  			return prettyKey, prettyValue, nil
  1048  		}
  1049  	}
  1050  
  1051  	// No need to unmarshal the column value. Either the column was part of
  1052  	// the index key or it isn't needed.
  1053  	if DebugRowFetch {
  1054  		log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID)
  1055  	}
  1056  	return prettyKey, prettyValue, nil
  1057  }
  1058  
  1059  func (rf *Fetcher) processValueBytes(
  1060  	ctx context.Context,
  1061  	table *tableInfo,
  1062  	kv roachpb.KeyValue,
  1063  	valueBytes []byte,
  1064  	prettyKeyPrefix string,
  1065  ) (prettyKey string, prettyValue string, err error) {
  1066  	prettyKey = prettyKeyPrefix
  1067  	if rf.traceKV {
  1068  		if rf.prettyValueBuf == nil {
  1069  			rf.prettyValueBuf = &bytes.Buffer{}
  1070  		}
  1071  		rf.prettyValueBuf.Reset()
  1072  	}
  1073  
  1074  	var colIDDiff uint32
  1075  	var lastColID sqlbase.ColumnID
  1076  	var typeOffset, dataOffset int
  1077  	var typ encoding.Type
  1078  	for len(valueBytes) > 0 && rf.valueColsFound < table.neededValueCols {
  1079  		typeOffset, dataOffset, colIDDiff, typ, err = encoding.DecodeValueTag(valueBytes)
  1080  		if err != nil {
  1081  			return "", "", err
  1082  		}
  1083  		colID := lastColID + sqlbase.ColumnID(colIDDiff)
  1084  		lastColID = colID
  1085  		if !table.neededCols.Contains(int(colID)) {
  1086  			// This column wasn't requested, so read its length and skip it.
  1087  			len, err := encoding.PeekValueLengthWithOffsetsAndType(valueBytes, dataOffset, typ)
  1088  			if err != nil {
  1089  				return "", "", err
  1090  			}
  1091  			valueBytes = valueBytes[len:]
  1092  			if DebugRowFetch {
  1093  				log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID)
  1094  			}
  1095  			continue
  1096  		}
  1097  		idx := table.colIdxMap[colID]
  1098  
  1099  		if rf.traceKV {
  1100  			prettyKey = fmt.Sprintf("%s/%s", prettyKey, table.desc.DeletableColumns()[idx].Name)
  1101  		}
  1102  
  1103  		var encValue sqlbase.EncDatum
  1104  		encValue, valueBytes, err = sqlbase.EncDatumValueFromBufferWithOffsetsAndType(valueBytes, typeOffset,
  1105  			dataOffset, typ)
  1106  		if err != nil {
  1107  			return "", "", err
  1108  		}
  1109  		if rf.traceKV {
  1110  			err := encValue.EnsureDecoded(table.cols[idx].Type, rf.alloc)
  1111  			if err != nil {
  1112  				return "", "", err
  1113  			}
  1114  			fmt.Fprintf(rf.prettyValueBuf, "/%v", encValue.Datum)
  1115  		}
  1116  		table.row[idx] = encValue
  1117  		rf.valueColsFound++
  1118  		if DebugRowFetch {
  1119  			log.Infof(ctx, "Scan %d -> %v", idx, encValue)
  1120  		}
  1121  	}
  1122  	if rf.traceKV {
  1123  		prettyValue = rf.prettyValueBuf.String()
  1124  	}
  1125  	return prettyKey, prettyValue, nil
  1126  }
  1127  
  1128  // processValueTuple processes the given values (of columns family.ColumnIDs),
  1129  // setting values in the rf.row accordingly. The key is only used for logging.
  1130  func (rf *Fetcher) processValueTuple(
  1131  	ctx context.Context, table *tableInfo, kv roachpb.KeyValue, prettyKeyPrefix string,
  1132  ) (prettyKey string, prettyValue string, err error) {
  1133  	tupleBytes, err := kv.Value.GetTuple()
  1134  	if err != nil {
  1135  		return "", "", err
  1136  	}
  1137  	return rf.processValueBytes(ctx, table, kv, tupleBytes, prettyKeyPrefix)
  1138  }
  1139  
  1140  // NextRow processes keys until we complete one row, which is returned as an
  1141  // EncDatumRow. The row contains one value per table column, regardless of the
  1142  // index used; values that are not needed (as per neededCols) are nil. The
  1143  // EncDatumRow should not be modified and is only valid until the next call.
  1144  // When there are no more rows, the EncDatumRow is nil. The error returned may
  1145  // be a scrub.ScrubError, which the caller is responsible for unwrapping.
  1146  // It also returns the table and index descriptor associated with the row
  1147  // (relevant when more than one table is specified during initialization).
  1148  func (rf *Fetcher) NextRow(
  1149  	ctx context.Context,
  1150  ) (
  1151  	row sqlbase.EncDatumRow,
  1152  	table *sqlbase.TableDescriptor,
  1153  	index *sqlbase.IndexDescriptor,
  1154  	err error,
  1155  ) {
  1156  	if rf.kvEnd {
  1157  		return nil, nil, nil, nil
  1158  	}
  1159  
  1160  	// All of the columns for a particular row will be grouped together. We
  1161  	// loop over the key/value pairs and decode the key to extract the
  1162  	// columns encoded within the key and the column ID. We use the column
  1163  	// ID to lookup the column and decode the value. All of these values go
  1164  	// into a map keyed by column name. When the index key changes we
  1165  	// output a row containing the current values.
  1166  	for {
  1167  		prettyKey, prettyVal, err := rf.processKV(ctx, rf.kv)
  1168  		if err != nil {
  1169  			return nil, nil, nil, err
  1170  		}
  1171  		if rf.traceKV {
  1172  			log.VEventf(ctx, 2, "fetched: %s -> %s", prettyKey, prettyVal)
  1173  		}
  1174  
  1175  		if rf.isCheck {
  1176  			rf.rowReadyTable.lastKV = rf.kv
  1177  		}
  1178  		rowDone, err := rf.NextKey(ctx)
  1179  		if err != nil {
  1180  			return nil, nil, nil, err
  1181  		}
  1182  		if rowDone {
  1183  			err := rf.finalizeRow()
  1184  			return rf.rowReadyTable.row, rf.rowReadyTable.desc.TableDesc(), rf.rowReadyTable.index, err
  1185  		}
  1186  	}
  1187  }
  1188  
  1189  // NextRowDecoded calls NextRow and decodes the EncDatumRow into a Datums.
  1190  // The Datums should not be modified and is only valid until the next call.
  1191  // When there are no more rows, the Datums is nil.
  1192  // It also returns the table and index descriptor associated with the row
  1193  // (relevant when more than one table is specified during initialization).
  1194  func (rf *Fetcher) NextRowDecoded(
  1195  	ctx context.Context,
  1196  ) (
  1197  	datums tree.Datums,
  1198  	table *sqlbase.TableDescriptor,
  1199  	index *sqlbase.IndexDescriptor,
  1200  	err error,
  1201  ) {
  1202  	row, table, index, err := rf.NextRow(ctx)
  1203  	if err != nil {
  1204  		err = scrub.UnwrapScrubError(err)
  1205  		return nil, nil, nil, err
  1206  	}
  1207  	if row == nil {
  1208  		return nil, nil, nil, nil
  1209  	}
  1210  
  1211  	for i, encDatum := range row {
  1212  		if encDatum.IsUnset() {
  1213  			rf.rowReadyTable.decodedRow[i] = tree.DNull
  1214  			continue
  1215  		}
  1216  		if err := encDatum.EnsureDecoded(rf.rowReadyTable.cols[i].Type, rf.alloc); err != nil {
  1217  			return nil, nil, nil, err
  1218  		}
  1219  		rf.rowReadyTable.decodedRow[i] = encDatum.Datum
  1220  	}
  1221  
  1222  	return rf.rowReadyTable.decodedRow, table, index, nil
  1223  }
  1224  
  1225  // RowLastModified may only be called after NextRow has returned a non-nil row
  1226  // and returns the timestamp of the last modification to that row.
  1227  func (rf *Fetcher) RowLastModified() hlc.Timestamp {
  1228  	return rf.rowReadyTable.rowLastModified
  1229  }
  1230  
  1231  // RowIsDeleted may only be called after NextRow has returned a non-nil row and
  1232  // returns true if that row was most recently deleted. This method is only
  1233  // meaningful when the configured kvBatchFetcher returns deletion tombstones, which
  1234  // the normal one (via `StartScan`) does not.
  1235  func (rf *Fetcher) RowIsDeleted() bool {
  1236  	return rf.rowReadyTable.rowIsDeleted
  1237  }
  1238  
  1239  // NextRowWithErrors calls NextRow to fetch the next row and also run
  1240  // additional additional logic for physical checks. The Datums should
  1241  // not be modified and are only valid until the next call. When there
  1242  // are no more rows, the Datums is nil. The checks executed include:
  1243  //  - k/v data round-trips, i.e. it decodes and re-encodes to the same
  1244  //    value.
  1245  //  - There is no extra unexpected or incorrect data encoded in the k/v
  1246  //    pair.
  1247  //  - Decoded keys follow the same ordering as their encoding.
  1248  func (rf *Fetcher) NextRowWithErrors(ctx context.Context) (sqlbase.EncDatumRow, error) {
  1249  	row, table, index, err := rf.NextRow(ctx)
  1250  	if row == nil {
  1251  		return nil, nil
  1252  	} else if err != nil {
  1253  		// If this is not already a wrapped error, we will consider it to be
  1254  		// a generic physical error.
  1255  		// FIXME(joey): This may not be needed if we capture all the errors
  1256  		// encountered. This is a TBD when this change is polished.
  1257  		if !scrub.IsScrubError(err) {
  1258  			err = scrub.WrapError(scrub.PhysicalError, err)
  1259  		}
  1260  		return row, err
  1261  	}
  1262  
  1263  	// Decode the row in-place. The following check datum encoding
  1264  	// functions require that the table.row datums are decoded.
  1265  	for i := range row {
  1266  		if row[i].IsUnset() {
  1267  			rf.rowReadyTable.decodedRow[i] = tree.DNull
  1268  			continue
  1269  		}
  1270  		if err := row[i].EnsureDecoded(rf.rowReadyTable.cols[i].Type, rf.alloc); err != nil {
  1271  			return nil, err
  1272  		}
  1273  		rf.rowReadyTable.decodedRow[i] = row[i].Datum
  1274  	}
  1275  
  1276  	if index.ID == table.PrimaryIndex.ID {
  1277  		err = rf.checkPrimaryIndexDatumEncodings(ctx)
  1278  	} else {
  1279  		err = rf.checkSecondaryIndexDatumEncodings(ctx)
  1280  	}
  1281  	if err != nil {
  1282  		return row, err
  1283  	}
  1284  
  1285  	err = rf.checkKeyOrdering(ctx)
  1286  
  1287  	return row, err
  1288  }
  1289  
  1290  // checkPrimaryIndexDatumEncodings will run a round-trip encoding check
  1291  // on all values in the buffered row. This check is specific to primary
  1292  // index datums.
  1293  func (rf *Fetcher) checkPrimaryIndexDatumEncodings(ctx context.Context) error {
  1294  	table := rf.rowReadyTable
  1295  	scratch := make([]byte, 1024)
  1296  	colIDToColumn := make(map[sqlbase.ColumnID]*sqlbase.ColumnDescriptor)
  1297  	for i := range table.desc.Columns {
  1298  		col := &table.desc.Columns[i]
  1299  		colIDToColumn[col.ID] = col
  1300  	}
  1301  
  1302  	rh := rowHelper{TableDesc: table.desc, Indexes: table.desc.Indexes}
  1303  
  1304  	for i := range table.desc.Families {
  1305  		var lastColID sqlbase.ColumnID
  1306  		familyID := table.desc.Families[i].ID
  1307  		familySortedColumnIDs, ok := rh.sortedColumnFamily(familyID)
  1308  		if !ok {
  1309  			return errors.AssertionFailedf("invalid family sorted column id map for family %d", familyID)
  1310  		}
  1311  
  1312  		for _, colID := range familySortedColumnIDs {
  1313  			rowVal := table.row[table.colIdxMap[colID]]
  1314  			if rowVal.IsNull() {
  1315  				// Column is not present.
  1316  				continue
  1317  			}
  1318  
  1319  			if skip, err := rh.skipColumnInPK(colID, familyID, rowVal.Datum); err != nil {
  1320  				return errors.NewAssertionErrorWithWrappedErrf(err, "unable to determine skip")
  1321  			} else if skip {
  1322  				continue
  1323  			}
  1324  
  1325  			col := colIDToColumn[colID]
  1326  			if col == nil {
  1327  				return errors.AssertionFailedf("column mapping not found for column %d", colID)
  1328  			}
  1329  
  1330  			if lastColID > col.ID {
  1331  				return errors.AssertionFailedf("cannot write column id %d after %d", col.ID, lastColID)
  1332  			}
  1333  			colIDDiff := col.ID - lastColID
  1334  			lastColID = col.ID
  1335  
  1336  			if result, err := sqlbase.EncodeTableValue([]byte(nil), colIDDiff, rowVal.Datum,
  1337  				scratch); err != nil {
  1338  				return errors.NewAssertionErrorWithWrappedErrf(err, "could not re-encode column %s, value was %#v",
  1339  					col.Name, rowVal.Datum)
  1340  			} else if !rowVal.BytesEqual(result) {
  1341  				return scrub.WrapError(scrub.IndexValueDecodingError, errors.Errorf(
  1342  					"value failed to round-trip encode. Column=%s colIDDiff=%d Key=%s expected %#v, got: %#v",
  1343  					col.Name, colIDDiff, rf.kv.Key, rowVal.EncodedString(), result))
  1344  			}
  1345  		}
  1346  	}
  1347  	return nil
  1348  }
  1349  
  1350  // checkSecondaryIndexDatumEncodings will run a round-trip encoding
  1351  // check on all values in the buffered row. This check is specific to
  1352  // secondary index datums.
  1353  func (rf *Fetcher) checkSecondaryIndexDatumEncodings(ctx context.Context) error {
  1354  	table := rf.rowReadyTable
  1355  	colToEncDatum := make(map[sqlbase.ColumnID]sqlbase.EncDatum, len(table.row))
  1356  	values := make(tree.Datums, len(table.row))
  1357  	for i, col := range table.cols {
  1358  		colToEncDatum[col.ID] = table.row[i]
  1359  		values[i] = table.row[i].Datum
  1360  	}
  1361  
  1362  	// The below code makes incorrect checks (#45256).
  1363  	indexEntries, err := sqlbase.EncodeSecondaryIndex(
  1364  		rf.codec, table.desc.TableDesc(), table.index, table.colIdxMap, values, false /* includeEmpty */)
  1365  	if err != nil {
  1366  		return err
  1367  	}
  1368  
  1369  	for _, indexEntry := range indexEntries {
  1370  		// We ignore the first 4 bytes of the values. These bytes are a
  1371  		// checksum which are not set by EncodeSecondaryIndex.
  1372  		if !indexEntry.Key.Equal(rf.rowReadyTable.lastKV.Key) {
  1373  			return scrub.WrapError(scrub.IndexKeyDecodingError, errors.Errorf(
  1374  				"secondary index key failed to round-trip encode. expected %#v, got: %#v",
  1375  				rf.rowReadyTable.lastKV.Key, indexEntry.Key))
  1376  		} else if !indexEntry.Value.EqualData(table.lastKV.Value) {
  1377  			return scrub.WrapError(scrub.IndexValueDecodingError, errors.Errorf(
  1378  				"secondary index value failed to round-trip encode. expected %#v, got: %#v",
  1379  				rf.rowReadyTable.lastKV.Value, indexEntry.Value))
  1380  		}
  1381  	}
  1382  	return nil
  1383  }
  1384  
  1385  // checkKeyOrdering verifies that the datums decoded for the current key
  1386  // have the same ordering as the encoded key.
  1387  func (rf *Fetcher) checkKeyOrdering(ctx context.Context) error {
  1388  	defer func() {
  1389  		rf.rowReadyTable.lastDatums = append(tree.Datums(nil), rf.rowReadyTable.decodedRow...)
  1390  	}()
  1391  
  1392  	if !rf.rowReadyTable.hasLast {
  1393  		rf.rowReadyTable.hasLast = true
  1394  		return nil
  1395  	}
  1396  
  1397  	evalCtx := tree.EvalContext{}
  1398  	// Iterate through columns in order, comparing each value to the value in the
  1399  	// previous row in that column. When the first column with a differing value
  1400  	// is found, compare the values to ensure the ordering matches the column
  1401  	// ordering.
  1402  	for i, id := range rf.rowReadyTable.index.ColumnIDs {
  1403  		idx := rf.rowReadyTable.colIdxMap[id]
  1404  		result := rf.rowReadyTable.decodedRow[idx].Compare(&evalCtx, rf.rowReadyTable.lastDatums[idx])
  1405  		expectedDirection := rf.rowReadyTable.index.ColumnDirections[i]
  1406  		if rf.reverse && expectedDirection == sqlbase.IndexDescriptor_ASC {
  1407  			expectedDirection = sqlbase.IndexDescriptor_DESC
  1408  		} else if rf.reverse && expectedDirection == sqlbase.IndexDescriptor_DESC {
  1409  			expectedDirection = sqlbase.IndexDescriptor_ASC
  1410  		}
  1411  
  1412  		if result != 0 {
  1413  			if expectedDirection == sqlbase.IndexDescriptor_ASC && result < 0 ||
  1414  				expectedDirection == sqlbase.IndexDescriptor_DESC && result > 0 {
  1415  				return scrub.WrapError(scrub.IndexKeyDecodingError,
  1416  					errors.Errorf("key ordering did not match datum ordering. IndexDescriptor=%s",
  1417  						expectedDirection))
  1418  			}
  1419  			// After the first column with a differing value is found, the remaining
  1420  			// columns are skipped (see #32874).
  1421  			break
  1422  		}
  1423  	}
  1424  	return nil
  1425  }
  1426  
  1427  func (rf *Fetcher) finalizeRow() error {
  1428  	table := rf.rowReadyTable
  1429  	// Fill in any missing values with NULLs
  1430  	for i := range table.cols {
  1431  		if rf.valueColsFound == table.neededValueCols {
  1432  			// Found all cols - done!
  1433  			return nil
  1434  		}
  1435  		if table.neededCols.Contains(int(table.cols[i].ID)) && table.row[i].IsUnset() {
  1436  			// If the row was deleted, we'll be missing any non-primary key
  1437  			// columns, including nullable ones, but this is expected.
  1438  			if !table.cols[i].Nullable && !table.rowIsDeleted {
  1439  				var indexColValues []string
  1440  				for _, idx := range table.indexColIdx {
  1441  					if idx != -1 {
  1442  						indexColValues = append(indexColValues, table.row[idx].String(table.cols[idx].Type))
  1443  					} else {
  1444  						indexColValues = append(indexColValues, "?")
  1445  					}
  1446  				}
  1447  				err := errors.AssertionFailedf(
  1448  					"Non-nullable column \"%s:%s\" with no value! Index scanned was %q with the index key columns (%s) and the values (%s)",
  1449  					table.desc.Name, table.cols[i].Name, table.index.Name,
  1450  					strings.Join(table.index.ColumnNames, ","), strings.Join(indexColValues, ","))
  1451  
  1452  				if rf.isCheck {
  1453  					return scrub.WrapError(scrub.UnexpectedNullValueError, err)
  1454  				}
  1455  				return err
  1456  			}
  1457  			table.row[i] = sqlbase.EncDatum{
  1458  				Datum: tree.DNull,
  1459  			}
  1460  			// We've set valueColsFound to the number of present columns in the row
  1461  			// already, in processValueBytes. Now, we're filling in columns that have
  1462  			// no encoded values with NULL - so we increment valueColsFound to permit
  1463  			// early exit from this loop once all needed columns are filled in.
  1464  			rf.valueColsFound++
  1465  		}
  1466  	}
  1467  	return nil
  1468  }
  1469  
  1470  // Key returns the next key (the key that follows the last returned row).
  1471  // Key returns nil when there are no more rows.
  1472  func (rf *Fetcher) Key() roachpb.Key {
  1473  	return rf.kv.Key
  1474  }
  1475  
  1476  // PartialKey returns a partial slice of the next key (the key that follows the
  1477  // last returned row) containing nCols columns, without the ending column
  1478  // family. Returns nil when there are no more rows.
  1479  func (rf *Fetcher) PartialKey(nCols int) (roachpb.Key, error) {
  1480  	if rf.kv.Key == nil {
  1481  		return nil, nil
  1482  	}
  1483  	n, err := consumeIndexKeyWithoutTableIDIndexIDPrefix(
  1484  		rf.currentTable.index, nCols, rf.kv.Key[rf.currentTable.knownPrefixLength:])
  1485  	if err != nil {
  1486  		return nil, err
  1487  	}
  1488  	return rf.kv.Key[:n+rf.currentTable.knownPrefixLength], nil
  1489  }
  1490  
  1491  // GetRangesInfo returns information about the ranges where the rows came from.
  1492  // The RangeInfo's are deduped and not ordered.
  1493  func (rf *Fetcher) GetRangesInfo() []roachpb.RangeInfo {
  1494  	f := rf.kvFetcher
  1495  	if f == nil {
  1496  		// Not yet initialized.
  1497  		return nil
  1498  	}
  1499  	return f.GetRangesInfo()
  1500  }
  1501  
  1502  // GetBytesRead returns total number of bytes read by the underlying KVFetcher.
  1503  func (rf *Fetcher) GetBytesRead() int64 {
  1504  	f := rf.kvFetcher
  1505  	if f == nil {
  1506  		// Not yet initialized.
  1507  		return 0
  1508  	}
  1509  	return f.bytesRead
  1510  }
  1511  
  1512  // Only unique secondary indexes have extra columns to decode (namely the
  1513  // primary index columns).
  1514  func hasExtraCols(table *tableInfo) bool {
  1515  	return table.isSecondaryIndex && table.index.Unique
  1516  }
  1517  
  1518  // consumeIndexKeyWithoutTableIDIndexIDPrefix consumes an index key that's
  1519  // already pre-stripped of its table ID index ID prefix, up to nCols columns,
  1520  // returning the number of bytes consumed. For example, given an input key
  1521  // with values (6,7,8,9) such as /Table/60/1/6/7/#/61/1/8/9, stripping 3 columns
  1522  // from this key would eat all but the final, 4th column 9 in this example,
  1523  // producing /Table/60/1/6/7/#/61/1/8. If nCols was 2, instead, the result
  1524  // would include the trailing table ID index ID pair, since that's a more
  1525  // precise key: /Table/60/1/6/7/#/61/1.
  1526  func consumeIndexKeyWithoutTableIDIndexIDPrefix(
  1527  	index *sqlbase.IndexDescriptor, nCols int, key []byte,
  1528  ) (int, error) {
  1529  	origKeyLen := len(key)
  1530  	consumedCols := 0
  1531  	for _, ancestor := range index.Interleave.Ancestors {
  1532  		length := int(ancestor.SharedPrefixLen)
  1533  		// Skip up to length values.
  1534  		for j := 0; j < length; j++ {
  1535  			if consumedCols == nCols {
  1536  				// We're done early, in the middle of an interleave.
  1537  				return origKeyLen - len(key), nil
  1538  			}
  1539  			l, err := encoding.PeekLength(key)
  1540  			if err != nil {
  1541  				return 0, err
  1542  			}
  1543  			key = key[l:]
  1544  			consumedCols++
  1545  		}
  1546  		var ok bool
  1547  		key, ok = encoding.DecodeIfInterleavedSentinel(key)
  1548  		if !ok {
  1549  			return 0, errors.New("unexpected lack of sentinel key")
  1550  		}
  1551  
  1552  		// Skip the TableID/IndexID pair for each ancestor except for the
  1553  		// first, which has already been skipped in our input.
  1554  		for j := 0; j < 2; j++ {
  1555  			idLen, err := encoding.PeekLength(key)
  1556  			if err != nil {
  1557  				return 0, err
  1558  			}
  1559  			key = key[idLen:]
  1560  		}
  1561  	}
  1562  
  1563  	// Decode the remaining values in the key, in the final interleave.
  1564  	for ; consumedCols < nCols; consumedCols++ {
  1565  		l, err := encoding.PeekLength(key)
  1566  		if err != nil {
  1567  			return 0, err
  1568  		}
  1569  		key = key[l:]
  1570  	}
  1571  
  1572  	return origKeyLen - len(key), nil
  1573  }