github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/row/fetcher.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package row 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "strings" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/keys" 21 "github.com/cockroachdb/cockroach/pkg/kv" 22 "github.com/cockroachdb/cockroach/pkg/roachpb" 23 "github.com/cockroachdb/cockroach/pkg/sql/scrub" 24 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 25 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 26 "github.com/cockroachdb/cockroach/pkg/sql/types" 27 "github.com/cockroachdb/cockroach/pkg/util" 28 "github.com/cockroachdb/cockroach/pkg/util/encoding" 29 "github.com/cockroachdb/cockroach/pkg/util/hlc" 30 "github.com/cockroachdb/cockroach/pkg/util/log" 31 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 32 "github.com/cockroachdb/errors" 33 ) 34 35 // DebugRowFetch can be used to turn on some low-level debugging logs. We use 36 // this to avoid using log.V in the hot path. 37 const DebugRowFetch = false 38 39 type kvBatchFetcher interface { 40 // nextBatch returns the next batch of rows. Returns false in the first 41 // parameter if there are no more keys in the scan. May return either a slice 42 // of KeyValues or a batchResponse, numKvs pair, depending on the server 43 // version - both must be handled by calling code. 44 nextBatch(ctx context.Context) (ok bool, kvs []roachpb.KeyValue, 45 batchResponse []byte, origSpan roachpb.Span, err error) 46 GetRangesInfo() []roachpb.RangeInfo 47 } 48 49 type tableInfo struct { 50 // -- Fields initialized once -- 51 52 // Used to determine whether a key retrieved belongs to the span we 53 // want to scan. 54 spans roachpb.Spans 55 desc *sqlbase.ImmutableTableDescriptor 56 index *sqlbase.IndexDescriptor 57 isSecondaryIndex bool 58 indexColumnDirs []sqlbase.IndexDescriptor_Direction 59 // equivSignature is an equivalence class for each unique table-index 60 // pair. It allows us to check if an index key belongs to a given 61 // table-index. 62 equivSignature []byte 63 64 // The table columns to use for fetching, possibly including ones currently in 65 // schema changes. 66 cols []sqlbase.ColumnDescriptor 67 68 // The set of ColumnIDs that are required. 69 neededCols util.FastIntSet 70 71 // The set of indexes into the cols array that are required for columns 72 // in the value part. 73 neededValueColsByIdx util.FastIntSet 74 75 // The number of needed columns from the value part of the row. Once we've 76 // seen this number of value columns for a particular row, we can stop 77 // decoding values in that row. 78 neededValueCols int 79 80 // Map used to get the index for columns in cols. 81 colIdxMap map[sqlbase.ColumnID]int 82 83 // One value per column that is part of the key; each value is a column 84 // index (into cols); -1 if we don't need the value for that column. 85 indexColIdx []int 86 87 // knownPrefixLength is the number of bytes in the index key prefix this 88 // Fetcher is configured for. The index key prefix is the table id, index 89 // id pair at the start of the key. 90 knownPrefixLength int 91 92 // -- Fields updated during a scan -- 93 94 keyValTypes []*types.T 95 extraTypes []*types.T 96 keyVals []sqlbase.EncDatum 97 extraVals []sqlbase.EncDatum 98 row sqlbase.EncDatumRow 99 decodedRow tree.Datums 100 101 // The following fields contain MVCC metadata for each row and may be 102 // returned to users of Fetcher immediately after NextRow returns. 103 // They're not important to ordinary consumers of Fetcher that only 104 // concern themselves with actual SQL row data. 105 // 106 // rowLastModified is the timestamp of the last time any family in the row 107 // was modified in any way. 108 rowLastModified hlc.Timestamp 109 // rowIsDeleted is true when the row has been deleted. This is only 110 // meaningful when kv deletion tombstones are returned by the kvBatchFetcher, 111 // which the one used by `StartScan` (the common case) doesnt. Notably, 112 // changefeeds use this by providing raw kvs with tombstones unfiltered via 113 // `StartScanFrom`. 114 rowIsDeleted bool 115 116 // hasLast indicates whether there was a previously scanned k/v. 117 hasLast bool 118 // lastDatums is a buffer for the current key. It is only present when 119 // doing a physical check in order to verify round-trip encoding. 120 // It is required because Fetcher.kv is overwritten before NextRow 121 // returns. 122 lastKV roachpb.KeyValue 123 // lastDatums is a buffer for the previously scanned k/v datums. It is 124 // only present when doing a physical check in order to verify 125 // ordering. 126 lastDatums tree.Datums 127 } 128 129 // FetcherTableArgs are the arguments passed to Fetcher.Init 130 // for a given table that includes descriptors and row information. 131 type FetcherTableArgs struct { 132 // The spans of keys to return for the given table. Fetcher 133 // ignores keys outside these spans. 134 // This is irrelevant if Fetcher is initialize with only one 135 // table. 136 Spans roachpb.Spans 137 Desc *sqlbase.ImmutableTableDescriptor 138 Index *sqlbase.IndexDescriptor 139 ColIdxMap map[sqlbase.ColumnID]int 140 IsSecondaryIndex bool 141 Cols []sqlbase.ColumnDescriptor 142 // The indexes (0 to # of columns - 1) of the columns to return. 143 ValNeededForCol util.FastIntSet 144 } 145 146 // Fetcher handles fetching kvs and forming table rows for an 147 // arbitrary number of tables. 148 // Usage: 149 // var rf Fetcher 150 // err := rf.Init(..) 151 // // Handle err 152 // err := rf.StartScan(..) 153 // // Handle err 154 // for { 155 // res, err := rf.NextRow() 156 // // Handle err 157 // if res.row == nil { 158 // // Done 159 // break 160 // } 161 // // Process res.row 162 // } 163 type Fetcher struct { 164 // codec is used to encode and decode sql keys. 165 codec keys.SQLCodec 166 167 // tables is a slice of all the tables and their descriptors for which 168 // rows are returned. 169 tables []tableInfo 170 171 // allEquivSignatures is a map used for checking if an equivalence 172 // signature belongs to any table or table's ancestor. It also maps the 173 // string representation of every table's and every table's ancestors' 174 // signature to the table's index in 'tables' for lookup during decoding. 175 // If 2+ tables share the same ancestor signature, allEquivSignatures 176 // will map the signature to the largest 'tables' index. 177 // The full signature for a given table in 'tables' will always map to 178 // its own index in 'tables'. 179 allEquivSignatures map[string]int 180 181 // reverse denotes whether or not the spans should be read in reverse 182 // or not when StartScan is invoked. 183 reverse bool 184 185 // maxKeysPerRow memoizes the maximum number of keys per row 186 // out of all the tables. This is used to calculate the kvBatchFetcher's 187 // firstBatchLimit. 188 maxKeysPerRow int 189 190 // True if the index key must be decoded. 191 // If there is more than one table, the index key must always be decoded. 192 // This is only false if there are no needed columns and the (single) 193 // table has no interleave children. 194 mustDecodeIndexKey bool 195 196 // lockStr represents the row-level locking mode to use when fetching rows. 197 lockStr sqlbase.ScanLockingStrength 198 199 // returnRangeInfo, if set, causes the underlying kvBatchFetcher to return 200 // information about the ranges descriptors/leases uses in servicing the 201 // requests. This has some cost, so it's only enabled by DistSQL when this 202 // info is actually useful for correcting the plan (e.g. not for the PK-side 203 // of an index-join). 204 // If set, GetRangesInfo() can be used to retrieve the accumulated info. 205 returnRangeInfo bool 206 207 // traceKV indicates whether or not session tracing is enabled. It is set 208 // when beginning a new scan. 209 traceKV bool 210 211 // -- Fields updated during a scan -- 212 213 kvFetcher *KVFetcher 214 indexKey []byte // the index key of the current row 215 prettyValueBuf *bytes.Buffer 216 217 valueColsFound int // how many needed cols we've found so far in the value 218 219 rowReadyTable *tableInfo // the table for which a row was fully decoded and ready for output 220 currentTable *tableInfo // the most recent table for which a key was decoded 221 keySigBuf []byte // buffer for the index key's signature 222 keyRestBuf []byte // buffer for the rest of the index key that is not part of the signature 223 224 // The current key/value, unless kvEnd is true. 225 kv roachpb.KeyValue 226 keyRemainingBytes []byte 227 kvEnd bool 228 229 // isCheck indicates whether or not we are running checks for k/v 230 // correctness. It is set only during SCRUB commands. 231 isCheck bool 232 233 // Buffered allocation of decoded datums. 234 alloc *sqlbase.DatumAlloc 235 } 236 237 // Reset resets this Fetcher, preserving the memory capacity that was used 238 // for the tables slice, and the slices within each of the tableInfo objects 239 // within tables. This permits reuse of this objects without forcing total 240 // reallocation of all of those slice fields. 241 func (rf *Fetcher) Reset() { 242 *rf = Fetcher{ 243 tables: rf.tables[:0], 244 } 245 } 246 247 // Init sets up a Fetcher for a given table and index. If we are using a 248 // non-primary index, tables.ValNeededForCol can only refer to columns in the 249 // index. 250 func (rf *Fetcher) Init( 251 codec keys.SQLCodec, 252 reverse bool, 253 lockStr sqlbase.ScanLockingStrength, 254 returnRangeInfo bool, 255 isCheck bool, 256 alloc *sqlbase.DatumAlloc, 257 tables ...FetcherTableArgs, 258 ) error { 259 if len(tables) == 0 { 260 return errors.AssertionFailedf("no tables to fetch from") 261 } 262 263 rf.codec = codec 264 rf.reverse = reverse 265 rf.lockStr = lockStr 266 rf.returnRangeInfo = returnRangeInfo 267 rf.alloc = alloc 268 rf.isCheck = isCheck 269 270 // We must always decode the index key if we need to distinguish between 271 // rows from more than one table. 272 nTables := len(tables) 273 multipleTables := nTables >= 2 274 rf.mustDecodeIndexKey = multipleTables 275 if multipleTables { 276 rf.allEquivSignatures = make(map[string]int, len(tables)) 277 } 278 279 if cap(rf.tables) >= nTables { 280 rf.tables = rf.tables[:nTables] 281 } else { 282 rf.tables = make([]tableInfo, nTables) 283 } 284 for tableIdx, tableArgs := range tables { 285 oldTable := rf.tables[tableIdx] 286 287 table := tableInfo{ 288 spans: tableArgs.Spans, 289 desc: tableArgs.Desc, 290 colIdxMap: tableArgs.ColIdxMap, 291 index: tableArgs.Index, 292 isSecondaryIndex: tableArgs.IsSecondaryIndex, 293 cols: tableArgs.Cols, 294 row: make(sqlbase.EncDatumRow, len(tableArgs.Cols)), 295 decodedRow: make(tree.Datums, len(tableArgs.Cols)), 296 297 // These slice fields might get re-allocated below, so reslice them from 298 // the old table here in case they've got enough capacity already. 299 indexColIdx: oldTable.indexColIdx[:0], 300 keyVals: oldTable.keyVals[:0], 301 extraVals: oldTable.extraVals[:0], 302 } 303 304 var err error 305 if multipleTables { 306 // We produce references to every signature's reference. 307 equivSignatures, err := sqlbase.TableEquivSignatures(table.desc.TableDesc(), table.index) 308 if err != nil { 309 return err 310 } 311 for i, sig := range equivSignatures { 312 // We always map the table's equivalence signature (last 313 // 'sig' in 'equivSignatures') to its tableIdx. 314 // This allows us to overwrite previous "ancestor 315 // signatures" (see below). 316 if i == len(equivSignatures)-1 { 317 rf.allEquivSignatures[string(sig)] = tableIdx 318 break 319 } 320 // Map each table's ancestors' signatures to -1 so 321 // we know during ReadIndexKey if the parsed index 322 // key belongs to ancestor or one of our tables. 323 // We must check if the signature has already been set 324 // since it's possible for a later 'table' to have an 325 // ancestor that is a previous 'table', and we do not 326 // want to overwrite the previous table's tableIdx. 327 if _, exists := rf.allEquivSignatures[string(sig)]; !exists { 328 rf.allEquivSignatures[string(sig)] = -1 329 } 330 } 331 // The last signature is the given table's equivalence signature. 332 table.equivSignature = equivSignatures[len(equivSignatures)-1] 333 } 334 335 // Scan through the entire columns map to see which columns are 336 // required. 337 for col, idx := range table.colIdxMap { 338 if tableArgs.ValNeededForCol.Contains(idx) { 339 // The idx-th column is required. 340 table.neededCols.Add(int(col)) 341 } 342 } 343 344 table.knownPrefixLength = len( 345 sqlbase.MakeIndexKeyPrefix(codec, table.desc.TableDesc(), table.index.ID), 346 ) 347 348 var indexColumnIDs []sqlbase.ColumnID 349 indexColumnIDs, table.indexColumnDirs = table.index.FullColumnIDs() 350 351 table.neededValueColsByIdx = tableArgs.ValNeededForCol.Copy() 352 neededIndexCols := 0 353 nIndexCols := len(indexColumnIDs) 354 if cap(table.indexColIdx) >= nIndexCols { 355 table.indexColIdx = table.indexColIdx[:nIndexCols] 356 } else { 357 table.indexColIdx = make([]int, nIndexCols) 358 } 359 for i, id := range indexColumnIDs { 360 colIdx, ok := table.colIdxMap[id] 361 if ok { 362 table.indexColIdx[i] = colIdx 363 if table.neededCols.Contains(int(id)) { 364 neededIndexCols++ 365 table.neededValueColsByIdx.Remove(colIdx) 366 } 367 } else { 368 table.indexColIdx[i] = -1 369 if table.neededCols.Contains(int(id)) { 370 return errors.AssertionFailedf("needed column %d not in colIdxMap", id) 371 } 372 } 373 } 374 375 // In order to track #40410 more effectively, check that the contents of 376 // table.neededValueColsByIdx are valid. 377 for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) { 378 if idx >= len(table.row) || idx < 0 { 379 return errors.AssertionFailedf( 380 "neededValueColsByIdx contains an invalid index. column %d requested, but table has %d columns", 381 idx, 382 len(table.row), 383 ) 384 } 385 } 386 387 // - If there is more than one table, we have to decode the index key to 388 // figure out which table the row belongs to. 389 // - If there are interleaves, we need to read the index key in order to 390 // determine whether this row is actually part of the index we're scanning. 391 // - If there are needed columns from the index key, we need to read it. 392 // 393 // Otherwise, we can completely avoid decoding the index key. 394 if !rf.mustDecodeIndexKey && (neededIndexCols > 0 || len(table.index.InterleavedBy) > 0 || len(table.index.Interleave.Ancestors) > 0) { 395 rf.mustDecodeIndexKey = true 396 } 397 398 // The number of columns we need to read from the value part of the key. 399 // It's the total number of needed columns minus the ones we read from the 400 // index key, except for composite columns. 401 table.neededValueCols = table.neededCols.Len() - neededIndexCols + len(table.index.CompositeColumnIDs) 402 403 if table.isSecondaryIndex { 404 for i := range table.cols { 405 if table.neededCols.Contains(int(table.cols[i].ID)) && !table.index.ContainsColumnID(table.cols[i].ID) { 406 return errors.Errorf("requested column %s not in index", table.cols[i].Name) 407 } 408 } 409 } 410 411 // Prepare our index key vals slice. 412 table.keyValTypes, err = sqlbase.GetColumnTypes(table.desc.TableDesc(), indexColumnIDs) 413 if err != nil { 414 return err 415 } 416 if cap(table.keyVals) >= nIndexCols { 417 table.keyVals = table.keyVals[:nIndexCols] 418 } else { 419 table.keyVals = make([]sqlbase.EncDatum, nIndexCols) 420 } 421 422 if hasExtraCols(&table) { 423 // Unique secondary indexes have a value that is the 424 // primary index key. 425 // Primary indexes only contain ascendingly-encoded 426 // values. If this ever changes, we'll probably have to 427 // figure out the directions here too. 428 table.extraTypes, err = sqlbase.GetColumnTypes(table.desc.TableDesc(), table.index.ExtraColumnIDs) 429 nExtraColumns := len(table.index.ExtraColumnIDs) 430 if cap(table.extraVals) >= nExtraColumns { 431 table.extraVals = table.extraVals[:nExtraColumns] 432 } else { 433 table.extraVals = make([]sqlbase.EncDatum, nExtraColumns) 434 } 435 if err != nil { 436 return err 437 } 438 } 439 440 // Keep track of the maximum keys per row to accommodate a 441 // limitHint when StartScan is invoked. 442 keysPerRow, err := table.desc.KeysPerRow(table.index.ID) 443 if err != nil { 444 return err 445 } 446 if keysPerRow > rf.maxKeysPerRow { 447 rf.maxKeysPerRow = keysPerRow 448 } 449 450 rf.tables[tableIdx] = table 451 } 452 453 if len(tables) == 1 { 454 // If there is more than one table, currentTable will be 455 // updated every time NextKey is invoked and rowReadyTable 456 // will be updated when a row is fully decoded. 457 rf.currentTable = &(rf.tables[0]) 458 rf.rowReadyTable = &(rf.tables[0]) 459 } 460 461 return nil 462 } 463 464 // StartScan initializes and starts the key-value scan. Can be used multiple 465 // times. 466 func (rf *Fetcher) StartScan( 467 ctx context.Context, 468 txn *kv.Txn, 469 spans roachpb.Spans, 470 limitBatches bool, 471 limitHint int64, 472 traceKV bool, 473 ) error { 474 if len(spans) == 0 { 475 return errors.AssertionFailedf("no spans") 476 } 477 478 rf.traceKV = traceKV 479 f, err := makeKVBatchFetcher( 480 txn, 481 spans, 482 rf.reverse, 483 limitBatches, 484 rf.firstBatchLimit(limitHint), 485 rf.lockStr, 486 rf.returnRangeInfo, 487 ) 488 if err != nil { 489 return err 490 } 491 return rf.StartScanFrom(ctx, &f) 492 } 493 494 // StartInconsistentScan initializes and starts an inconsistent scan, where each 495 // KV batch can be read at a different historical timestamp. 496 // 497 // The scan uses the initial timestamp, until it becomes older than 498 // maxTimestampAge; at this time the timestamp is bumped by the amount of time 499 // that has passed. See the documentation for TableReaderSpec for more 500 // details. 501 // 502 // Can be used multiple times. 503 func (rf *Fetcher) StartInconsistentScan( 504 ctx context.Context, 505 db *kv.DB, 506 initialTimestamp hlc.Timestamp, 507 maxTimestampAge time.Duration, 508 spans roachpb.Spans, 509 limitBatches bool, 510 limitHint int64, 511 traceKV bool, 512 ) error { 513 if len(spans) == 0 { 514 return errors.AssertionFailedf("no spans") 515 } 516 517 txnTimestamp := initialTimestamp 518 txnStartTime := timeutil.Now() 519 if txnStartTime.Sub(txnTimestamp.GoTime()) >= maxTimestampAge { 520 return errors.Errorf( 521 "AS OF SYSTEM TIME: cannot specify timestamp older than %s for this operation", 522 maxTimestampAge, 523 ) 524 } 525 txn := kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */) 526 txn.SetFixedTimestamp(ctx, txnTimestamp) 527 if log.V(1) { 528 log.Infof(ctx, "starting inconsistent scan at timestamp %v", txnTimestamp) 529 } 530 531 sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) { 532 if now := timeutil.Now(); now.Sub(txnTimestamp.GoTime()) >= maxTimestampAge { 533 // Time to bump the transaction. First commit the old one (should be a no-op). 534 if err := txn.Commit(ctx); err != nil { 535 return nil, err 536 } 537 // Advance the timestamp by the time that passed. 538 txnTimestamp = txnTimestamp.Add(now.Sub(txnStartTime).Nanoseconds(), 0 /* logical */) 539 txnStartTime = now 540 txn = kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */) 541 txn.SetFixedTimestamp(ctx, txnTimestamp) 542 543 if log.V(1) { 544 log.Infof(ctx, "bumped inconsistent scan timestamp to %v", txnTimestamp) 545 } 546 } 547 548 res, err := txn.Send(ctx, ba) 549 if err != nil { 550 return nil, err.GoError() 551 } 552 return res, nil 553 } 554 555 // TODO(radu): we should commit the last txn. Right now the commit is a no-op 556 // on read transactions, but perhaps one day it will release some resources. 557 558 rf.traceKV = traceKV 559 f, err := makeKVBatchFetcherWithSendFunc( 560 sendFunc(sendFn), 561 spans, 562 rf.reverse, 563 limitBatches, 564 rf.firstBatchLimit(limitHint), 565 rf.lockStr, 566 rf.returnRangeInfo, 567 ) 568 if err != nil { 569 return err 570 } 571 return rf.StartScanFrom(ctx, &f) 572 } 573 574 func (rf *Fetcher) firstBatchLimit(limitHint int64) int64 { 575 if limitHint == 0 { 576 return 0 577 } 578 // If we have a limit hint, we limit the first batch size. Subsequent 579 // batches get larger to avoid making things too slow (e.g. in case we have 580 // a very restrictive filter and actually have to retrieve a lot of rows). 581 // The limitHint is a row limit, but each row could be made up of more than 582 // one key. We take the maximum possible keys per row out of all the table 583 // rows we could potentially scan over. 584 // 585 // We add an extra key to make sure we form the last row. 586 return limitHint*int64(rf.maxKeysPerRow) + 1 587 } 588 589 // StartScanFrom initializes and starts a scan from the given kvBatchFetcher. Can be 590 // used multiple times. 591 func (rf *Fetcher) StartScanFrom(ctx context.Context, f kvBatchFetcher) error { 592 rf.indexKey = nil 593 rf.kvFetcher = newKVFetcher(f) 594 // Retrieve the first key. 595 _, err := rf.NextKey(ctx) 596 return err 597 } 598 599 // NextKey retrieves the next key/value and sets kv/kvEnd. Returns whether a row 600 // has been completed. 601 func (rf *Fetcher) NextKey(ctx context.Context) (rowDone bool, err error) { 602 var ok bool 603 604 for { 605 ok, rf.kv, _, err = rf.kvFetcher.NextKV(ctx) 606 if err != nil { 607 return false, err 608 } 609 rf.kvEnd = !ok 610 if rf.kvEnd { 611 // No more keys in the scan. We need to transition 612 // rf.rowReadyTable to rf.currentTable for the last 613 // row. 614 // 615 // NB: this assumes that the KV layer will never split a range 616 // between column families, which is a brittle assumption. 617 // See: 618 // https://github.com/cockroachdb/cockroach/pull/42056 619 rf.rowReadyTable = rf.currentTable 620 return true, nil 621 } 622 623 // foundNull is set when decoding a new index key for a row finds a NULL value 624 // in the index key. This is used when decoding unique secondary indexes in order 625 // to tell whether they have extra columns appended to the key. 626 var foundNull bool 627 628 // unchangedPrefix will be set to true if we can skip decoding the index key 629 // completely, because the last key we saw has identical prefix to the 630 // current key. 631 unchangedPrefix := rf.indexKey != nil && bytes.HasPrefix(rf.kv.Key, rf.indexKey) 632 if unchangedPrefix { 633 keySuffix := rf.kv.Key[len(rf.indexKey):] 634 if _, foundSentinel := encoding.DecodeIfInterleavedSentinel(keySuffix); foundSentinel { 635 // We found an interleaved sentinel, which means that the key we just 636 // found belongs to a different interleave. That means we have to go 637 // through with index key decoding. 638 unchangedPrefix = false 639 } else { 640 rf.keyRemainingBytes = keySuffix 641 } 642 } 643 // See Init() for a detailed description of when we can get away with not 644 // reading the index key. 645 if unchangedPrefix { 646 // Skip decoding! 647 // We must set the rowReadyTable to the currentTable like ReadIndexKey 648 // would do. This will happen when we see 2 rows in a row with the same 649 // prefix. If the previous prefix was from a different table, then we must 650 // update the ready table to the current table, updating the fetcher state 651 // machine to recognize that the next row that it outputs will be from 652 // rf.currentTable, which will be set to the table of the key that was 653 // last sent to ReadIndexKey. 654 // 655 // TODO(jordan): this is a major (but correct) mess. The fetcher is past 656 // due for a refactor, now that it's (more) clear what the state machine 657 // it's trying to model is. 658 rf.rowReadyTable = rf.currentTable 659 } else if rf.mustDecodeIndexKey || rf.traceKV { 660 rf.keyRemainingBytes, ok, foundNull, err = rf.ReadIndexKey(rf.kv.Key) 661 if err != nil { 662 return false, err 663 } 664 if !ok { 665 // The key did not match any of the table 666 // descriptors, which means it's interleaved 667 // data from some other table or index. 668 continue 669 } 670 } else { 671 // We still need to consume the key until the family 672 // id, so processKV can know whether we've finished a 673 // row or not. 674 prefixLen, err := keys.GetRowPrefixLength(rf.kv.Key) 675 if err != nil { 676 return false, err 677 } 678 679 rf.keyRemainingBytes = rf.kv.Key[prefixLen:] 680 } 681 682 // For unique secondary indexes, the index-key does not distinguish one row 683 // from the next if both rows contain identical values along with a NULL. 684 // Consider the keys: 685 // 686 // /test/unique_idx/NULL/0 687 // /test/unique_idx/NULL/1 688 // 689 // The index-key extracted from the above keys is /test/unique_idx/NULL. The 690 // trailing /0 and /1 are the primary key used to unique-ify the keys when a 691 // NULL is present. When a null is present in the index key, we cut off more 692 // of the index key so that the prefix includes the primary key columns. 693 // 694 // Note that we do not need to do this for non-unique secondary indexes because 695 // the extra columns in the primary key will _always_ be there, so we can decode 696 // them when processing the index. The difference with unique secondary indexes 697 // is that the extra columns are not always there, and are used to unique-ify 698 // the index key, rather than provide the primary key column values. 699 if foundNull && rf.currentTable.isSecondaryIndex && rf.currentTable.index.Unique && len(rf.currentTable.desc.Families) != 1 { 700 for range rf.currentTable.index.ExtraColumnIDs { 701 var err error 702 // Slice off an extra encoded column from rf.keyRemainingBytes. 703 rf.keyRemainingBytes, err = sqlbase.SkipTableKey(rf.keyRemainingBytes) 704 if err != nil { 705 return false, err 706 } 707 } 708 } 709 710 switch { 711 case len(rf.currentTable.desc.Families) == 1: 712 // If we only have one family, we know that there is only 1 k/v pair per row. 713 rowDone = true 714 case !unchangedPrefix: 715 // If the prefix of the key has changed, current key is from a different 716 // row than the previous one. 717 rowDone = true 718 case rf.rowReadyTable != rf.currentTable: 719 // For rowFetchers with more than one table, if the table changes the row 720 // is done. 721 rowDone = true 722 default: 723 rowDone = false 724 } 725 726 if rf.indexKey != nil && rowDone { 727 // The current key belongs to a new row. Output the 728 // current row. 729 rf.indexKey = nil 730 return true, nil 731 } 732 733 return false, nil 734 } 735 } 736 737 func (rf *Fetcher) prettyEncDatums(types []*types.T, vals []sqlbase.EncDatum) string { 738 var buf strings.Builder 739 for i, v := range vals { 740 if err := v.EnsureDecoded(types[i], rf.alloc); err != nil { 741 buf.WriteString("error decoding: ") 742 buf.WriteString(err.Error()) 743 } 744 buf.WriteByte('/') 745 buf.WriteString(v.Datum.String()) 746 } 747 return buf.String() 748 } 749 750 // ReadIndexKey decodes an index key for a given table. 751 // It returns whether or not the key is for any of the tables initialized 752 // in Fetcher, and the remaining part of the key if it is. 753 // ReadIndexKey additionally returns whether or not it encountered a null while decoding. 754 func (rf *Fetcher) ReadIndexKey( 755 key roachpb.Key, 756 ) (remaining []byte, ok bool, foundNull bool, err error) { 757 // If there is only one table to check keys for, there is no need 758 // to go through the equivalence signature checks. 759 if len(rf.tables) == 1 { 760 return sqlbase.DecodeIndexKeyWithoutTableIDIndexIDPrefix( 761 rf.currentTable.desc.TableDesc(), 762 rf.currentTable.index, 763 rf.currentTable.keyValTypes, 764 rf.currentTable.keyVals, 765 rf.currentTable.indexColumnDirs, 766 key[rf.currentTable.knownPrefixLength:], 767 ) 768 } 769 770 // Make a copy of the initial key for validating whether it's within 771 // the table's specified spans. 772 initialKey := key 773 774 // key now contains the bytes in the key (if match) that are not part 775 // of the signature in order. 776 tableIdx, key, match, err := sqlbase.IndexKeyEquivSignature(key, rf.allEquivSignatures, rf.keySigBuf, rf.keyRestBuf) 777 if err != nil { 778 return nil, false, false, err 779 } 780 // The index key does not belong to our table because either: 781 // !match: part of the index key's signature did not match any of 782 // rf.allEquivSignatures. 783 // tableIdx == -1: index key belongs to an ancestor. 784 if !match || tableIdx == -1 { 785 return nil, false, false, nil 786 } 787 788 // The index key is not within our specified span of keys for the 789 // particular table. 790 // TODO(richardwu): ContainsKey checks every span within spans. We 791 // can check that spans is ordered (or sort it) and memoize 792 // the last span we've checked for each table. We can pass in this 793 // information to ContainsKey as a hint for which span to start 794 // checking first. 795 if !rf.tables[tableIdx].spans.ContainsKey(initialKey) { 796 return nil, false, false, nil 797 } 798 799 // Either a new table is encountered or the rowReadyTable differs from 800 // the currentTable (the rowReadyTable was outputted in the previous 801 // read). We transition the references. 802 if &rf.tables[tableIdx] != rf.currentTable || rf.rowReadyTable != rf.currentTable { 803 rf.rowReadyTable = rf.currentTable 804 rf.currentTable = &rf.tables[tableIdx] 805 806 // rf.rowReadyTable is nil if this is the very first key. 807 // We want to ensure this does not differ from rf.currentTable 808 // to prevent another transition. 809 if rf.rowReadyTable == nil { 810 rf.rowReadyTable = rf.currentTable 811 } 812 } 813 814 // We can simply decode all the column values we retrieved 815 // when processing the ind 816 // ex key. The column values are at the 817 // front of the key. 818 if key, foundNull, err = sqlbase.DecodeKeyVals( 819 rf.currentTable.keyValTypes, 820 rf.currentTable.keyVals, 821 rf.currentTable.indexColumnDirs, 822 key, 823 ); err != nil { 824 return nil, false, false, err 825 } 826 827 return key, true, foundNull, nil 828 } 829 830 // processKV processes the given key/value, setting values in the row 831 // accordingly. If debugStrings is true, returns pretty printed key and value 832 // information in prettyKey/prettyValue (otherwise they are empty strings). 833 func (rf *Fetcher) processKV( 834 ctx context.Context, kv roachpb.KeyValue, 835 ) (prettyKey string, prettyValue string, err error) { 836 table := rf.currentTable 837 838 if rf.traceKV { 839 prettyKey = fmt.Sprintf( 840 "/%s/%s%s", 841 table.desc.Name, 842 table.index.Name, 843 rf.prettyEncDatums(table.keyValTypes, table.keyVals), 844 ) 845 } 846 847 // Either this is the first key of the fetch or the first key of a new 848 // row. 849 if rf.indexKey == nil { 850 // This is the first key for the row. 851 rf.indexKey = []byte(kv.Key[:len(kv.Key)-len(rf.keyRemainingBytes)]) 852 853 // Reset the row to nil; it will get filled in with the column 854 // values as we decode the key-value pairs for the row. 855 // We only need to reset the needed columns in the value component, because 856 // non-needed columns are never set and key columns are unconditionally set 857 // below. 858 for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) { 859 table.row[idx].UnsetDatum() 860 } 861 862 // Fill in the column values that are part of the index key. 863 for i := range table.keyVals { 864 if idx := table.indexColIdx[i]; idx != -1 { 865 table.row[idx] = table.keyVals[i] 866 } 867 } 868 869 rf.valueColsFound = 0 870 871 // Reset the MVCC metadata for the next row. 872 873 // set rowLastModified to a sentinel that's before any real timestamp. 874 // As kvs are iterated for this row, it keeps track of the greatest 875 // timestamp seen. 876 table.rowLastModified = hlc.Timestamp{} 877 // All row encodings (both before and after column families) have a 878 // sentinel kv (column family 0) that is always present when a row is 879 // present, even if that row is all NULLs. Thus, a row is deleted if and 880 // only if the first kv in it a tombstone (RawBytes is empty). 881 table.rowIsDeleted = len(kv.Value.RawBytes) == 0 882 } 883 884 if table.rowLastModified.Less(kv.Value.Timestamp) { 885 table.rowLastModified = kv.Value.Timestamp 886 } 887 888 if table.neededCols.Empty() { 889 // We don't need to decode any values. 890 if rf.traceKV { 891 prettyValue = tree.DNull.String() 892 } 893 return prettyKey, prettyValue, nil 894 } 895 896 // For covering secondary indexes, allow for decoding as a primary key. 897 if table.index.GetEncodingType(table.desc.PrimaryIndex.ID) == sqlbase.PrimaryIndexEncoding && 898 len(rf.keyRemainingBytes) > 0 { 899 // If familyID is 0, kv.Value contains values for composite key columns. 900 // These columns already have a table.row value assigned above, but that value 901 // (obtained from the key encoding) might not be correct (e.g. for decimals, 902 // it might not contain the right number of trailing 0s; for collated 903 // strings, it is one of potentially many strings with the same collation 904 // key). 905 // 906 // In these cases, the correct value will be present in family 0 and the 907 // table.row value gets overwritten. 908 909 switch kv.Value.GetTag() { 910 case roachpb.ValueType_TUPLE: 911 // In this case, we don't need to decode the column family ID, because 912 // the ValueType_TUPLE encoding includes the column id with every encoded 913 // column value. 914 prettyKey, prettyValue, err = rf.processValueTuple(ctx, table, kv, prettyKey) 915 default: 916 var familyID uint64 917 _, familyID, err = encoding.DecodeUvarintAscending(rf.keyRemainingBytes) 918 if err != nil { 919 return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err) 920 } 921 922 var family *sqlbase.ColumnFamilyDescriptor 923 family, err = table.desc.FindFamilyByID(sqlbase.FamilyID(familyID)) 924 if err != nil { 925 return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err) 926 } 927 928 prettyKey, prettyValue, err = rf.processValueSingle(ctx, table, family, kv, prettyKey) 929 } 930 if err != nil { 931 return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err) 932 } 933 } else { 934 tag := kv.Value.GetTag() 935 var valueBytes []byte 936 switch tag { 937 case roachpb.ValueType_BYTES: 938 // If we have the ValueType_BYTES on a secondary index, then we know we 939 // are looking at column family 0. Column family 0 stores the extra primary 940 // key columns if they are present, so we decode them here. 941 valueBytes, err = kv.Value.GetBytes() 942 if err != nil { 943 return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err) 944 } 945 if hasExtraCols(table) { 946 // This is a unique secondary index; decode the extra 947 // column values from the value. 948 var err error 949 valueBytes, _, err = sqlbase.DecodeKeyVals( 950 table.extraTypes, 951 table.extraVals, 952 nil, 953 valueBytes, 954 ) 955 if err != nil { 956 return "", "", scrub.WrapError(scrub.SecondaryIndexKeyExtraValueDecodingError, err) 957 } 958 for i, id := range table.index.ExtraColumnIDs { 959 if table.neededCols.Contains(int(id)) { 960 table.row[table.colIdxMap[id]] = table.extraVals[i] 961 } 962 } 963 if rf.traceKV { 964 prettyValue = rf.prettyEncDatums(table.extraTypes, table.extraVals) 965 } 966 } 967 case roachpb.ValueType_TUPLE: 968 valueBytes, err = kv.Value.GetTuple() 969 if err != nil { 970 return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err) 971 } 972 } 973 974 if DebugRowFetch { 975 if hasExtraCols(table) && tag == roachpb.ValueType_BYTES { 976 log.Infof(ctx, "Scan %s -> %s", kv.Key, rf.prettyEncDatums(table.extraTypes, table.extraVals)) 977 } else { 978 log.Infof(ctx, "Scan %s", kv.Key) 979 } 980 } 981 982 if len(valueBytes) > 0 { 983 prettyKey, prettyValue, err = rf.processValueBytes( 984 ctx, table, kv, valueBytes, prettyKey, 985 ) 986 if err != nil { 987 return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err) 988 } 989 } 990 } 991 992 if rf.traceKV && prettyValue == "" { 993 prettyValue = tree.DNull.String() 994 } 995 996 return prettyKey, prettyValue, nil 997 } 998 999 // processValueSingle processes the given value (of column 1000 // family.DefaultColumnID), setting values in table.row accordingly. The key is 1001 // only used for logging. 1002 func (rf *Fetcher) processValueSingle( 1003 ctx context.Context, 1004 table *tableInfo, 1005 family *sqlbase.ColumnFamilyDescriptor, 1006 kv roachpb.KeyValue, 1007 prettyKeyPrefix string, 1008 ) (prettyKey string, prettyValue string, err error) { 1009 prettyKey = prettyKeyPrefix 1010 1011 // If this is the row sentinel (in the legacy pre-family format), 1012 // a value is not expected, so we're done. 1013 if family.ID == 0 { 1014 return "", "", nil 1015 } 1016 1017 colID := family.DefaultColumnID 1018 if colID == 0 { 1019 return "", "", errors.Errorf("single entry value with no default column id") 1020 } 1021 1022 if rf.traceKV || table.neededCols.Contains(int(colID)) { 1023 if idx, ok := table.colIdxMap[colID]; ok { 1024 if rf.traceKV { 1025 prettyKey = fmt.Sprintf("%s/%s", prettyKey, table.desc.DeletableColumns()[idx].Name) 1026 } 1027 if len(kv.Value.RawBytes) == 0 { 1028 return prettyKey, "", nil 1029 } 1030 typ := table.cols[idx].Type 1031 // TODO(arjun): The value is a directly marshaled single value, so we 1032 // unmarshal it eagerly here. This can potentially be optimized out, 1033 // although that would require changing UnmarshalColumnValue to operate 1034 // on bytes, and for Encode/DecodeTableValue to operate on marshaled 1035 // single values. 1036 value, err := sqlbase.UnmarshalColumnValue(rf.alloc, typ, kv.Value) 1037 if err != nil { 1038 return "", "", err 1039 } 1040 if rf.traceKV { 1041 prettyValue = value.String() 1042 } 1043 table.row[idx] = sqlbase.DatumToEncDatum(typ, value) 1044 if DebugRowFetch { 1045 log.Infof(ctx, "Scan %s -> %v", kv.Key, value) 1046 } 1047 return prettyKey, prettyValue, nil 1048 } 1049 } 1050 1051 // No need to unmarshal the column value. Either the column was part of 1052 // the index key or it isn't needed. 1053 if DebugRowFetch { 1054 log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID) 1055 } 1056 return prettyKey, prettyValue, nil 1057 } 1058 1059 func (rf *Fetcher) processValueBytes( 1060 ctx context.Context, 1061 table *tableInfo, 1062 kv roachpb.KeyValue, 1063 valueBytes []byte, 1064 prettyKeyPrefix string, 1065 ) (prettyKey string, prettyValue string, err error) { 1066 prettyKey = prettyKeyPrefix 1067 if rf.traceKV { 1068 if rf.prettyValueBuf == nil { 1069 rf.prettyValueBuf = &bytes.Buffer{} 1070 } 1071 rf.prettyValueBuf.Reset() 1072 } 1073 1074 var colIDDiff uint32 1075 var lastColID sqlbase.ColumnID 1076 var typeOffset, dataOffset int 1077 var typ encoding.Type 1078 for len(valueBytes) > 0 && rf.valueColsFound < table.neededValueCols { 1079 typeOffset, dataOffset, colIDDiff, typ, err = encoding.DecodeValueTag(valueBytes) 1080 if err != nil { 1081 return "", "", err 1082 } 1083 colID := lastColID + sqlbase.ColumnID(colIDDiff) 1084 lastColID = colID 1085 if !table.neededCols.Contains(int(colID)) { 1086 // This column wasn't requested, so read its length and skip it. 1087 len, err := encoding.PeekValueLengthWithOffsetsAndType(valueBytes, dataOffset, typ) 1088 if err != nil { 1089 return "", "", err 1090 } 1091 valueBytes = valueBytes[len:] 1092 if DebugRowFetch { 1093 log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID) 1094 } 1095 continue 1096 } 1097 idx := table.colIdxMap[colID] 1098 1099 if rf.traceKV { 1100 prettyKey = fmt.Sprintf("%s/%s", prettyKey, table.desc.DeletableColumns()[idx].Name) 1101 } 1102 1103 var encValue sqlbase.EncDatum 1104 encValue, valueBytes, err = sqlbase.EncDatumValueFromBufferWithOffsetsAndType(valueBytes, typeOffset, 1105 dataOffset, typ) 1106 if err != nil { 1107 return "", "", err 1108 } 1109 if rf.traceKV { 1110 err := encValue.EnsureDecoded(table.cols[idx].Type, rf.alloc) 1111 if err != nil { 1112 return "", "", err 1113 } 1114 fmt.Fprintf(rf.prettyValueBuf, "/%v", encValue.Datum) 1115 } 1116 table.row[idx] = encValue 1117 rf.valueColsFound++ 1118 if DebugRowFetch { 1119 log.Infof(ctx, "Scan %d -> %v", idx, encValue) 1120 } 1121 } 1122 if rf.traceKV { 1123 prettyValue = rf.prettyValueBuf.String() 1124 } 1125 return prettyKey, prettyValue, nil 1126 } 1127 1128 // processValueTuple processes the given values (of columns family.ColumnIDs), 1129 // setting values in the rf.row accordingly. The key is only used for logging. 1130 func (rf *Fetcher) processValueTuple( 1131 ctx context.Context, table *tableInfo, kv roachpb.KeyValue, prettyKeyPrefix string, 1132 ) (prettyKey string, prettyValue string, err error) { 1133 tupleBytes, err := kv.Value.GetTuple() 1134 if err != nil { 1135 return "", "", err 1136 } 1137 return rf.processValueBytes(ctx, table, kv, tupleBytes, prettyKeyPrefix) 1138 } 1139 1140 // NextRow processes keys until we complete one row, which is returned as an 1141 // EncDatumRow. The row contains one value per table column, regardless of the 1142 // index used; values that are not needed (as per neededCols) are nil. The 1143 // EncDatumRow should not be modified and is only valid until the next call. 1144 // When there are no more rows, the EncDatumRow is nil. The error returned may 1145 // be a scrub.ScrubError, which the caller is responsible for unwrapping. 1146 // It also returns the table and index descriptor associated with the row 1147 // (relevant when more than one table is specified during initialization). 1148 func (rf *Fetcher) NextRow( 1149 ctx context.Context, 1150 ) ( 1151 row sqlbase.EncDatumRow, 1152 table *sqlbase.TableDescriptor, 1153 index *sqlbase.IndexDescriptor, 1154 err error, 1155 ) { 1156 if rf.kvEnd { 1157 return nil, nil, nil, nil 1158 } 1159 1160 // All of the columns for a particular row will be grouped together. We 1161 // loop over the key/value pairs and decode the key to extract the 1162 // columns encoded within the key and the column ID. We use the column 1163 // ID to lookup the column and decode the value. All of these values go 1164 // into a map keyed by column name. When the index key changes we 1165 // output a row containing the current values. 1166 for { 1167 prettyKey, prettyVal, err := rf.processKV(ctx, rf.kv) 1168 if err != nil { 1169 return nil, nil, nil, err 1170 } 1171 if rf.traceKV { 1172 log.VEventf(ctx, 2, "fetched: %s -> %s", prettyKey, prettyVal) 1173 } 1174 1175 if rf.isCheck { 1176 rf.rowReadyTable.lastKV = rf.kv 1177 } 1178 rowDone, err := rf.NextKey(ctx) 1179 if err != nil { 1180 return nil, nil, nil, err 1181 } 1182 if rowDone { 1183 err := rf.finalizeRow() 1184 return rf.rowReadyTable.row, rf.rowReadyTable.desc.TableDesc(), rf.rowReadyTable.index, err 1185 } 1186 } 1187 } 1188 1189 // NextRowDecoded calls NextRow and decodes the EncDatumRow into a Datums. 1190 // The Datums should not be modified and is only valid until the next call. 1191 // When there are no more rows, the Datums is nil. 1192 // It also returns the table and index descriptor associated with the row 1193 // (relevant when more than one table is specified during initialization). 1194 func (rf *Fetcher) NextRowDecoded( 1195 ctx context.Context, 1196 ) ( 1197 datums tree.Datums, 1198 table *sqlbase.TableDescriptor, 1199 index *sqlbase.IndexDescriptor, 1200 err error, 1201 ) { 1202 row, table, index, err := rf.NextRow(ctx) 1203 if err != nil { 1204 err = scrub.UnwrapScrubError(err) 1205 return nil, nil, nil, err 1206 } 1207 if row == nil { 1208 return nil, nil, nil, nil 1209 } 1210 1211 for i, encDatum := range row { 1212 if encDatum.IsUnset() { 1213 rf.rowReadyTable.decodedRow[i] = tree.DNull 1214 continue 1215 } 1216 if err := encDatum.EnsureDecoded(rf.rowReadyTable.cols[i].Type, rf.alloc); err != nil { 1217 return nil, nil, nil, err 1218 } 1219 rf.rowReadyTable.decodedRow[i] = encDatum.Datum 1220 } 1221 1222 return rf.rowReadyTable.decodedRow, table, index, nil 1223 } 1224 1225 // RowLastModified may only be called after NextRow has returned a non-nil row 1226 // and returns the timestamp of the last modification to that row. 1227 func (rf *Fetcher) RowLastModified() hlc.Timestamp { 1228 return rf.rowReadyTable.rowLastModified 1229 } 1230 1231 // RowIsDeleted may only be called after NextRow has returned a non-nil row and 1232 // returns true if that row was most recently deleted. This method is only 1233 // meaningful when the configured kvBatchFetcher returns deletion tombstones, which 1234 // the normal one (via `StartScan`) does not. 1235 func (rf *Fetcher) RowIsDeleted() bool { 1236 return rf.rowReadyTable.rowIsDeleted 1237 } 1238 1239 // NextRowWithErrors calls NextRow to fetch the next row and also run 1240 // additional additional logic for physical checks. The Datums should 1241 // not be modified and are only valid until the next call. When there 1242 // are no more rows, the Datums is nil. The checks executed include: 1243 // - k/v data round-trips, i.e. it decodes and re-encodes to the same 1244 // value. 1245 // - There is no extra unexpected or incorrect data encoded in the k/v 1246 // pair. 1247 // - Decoded keys follow the same ordering as their encoding. 1248 func (rf *Fetcher) NextRowWithErrors(ctx context.Context) (sqlbase.EncDatumRow, error) { 1249 row, table, index, err := rf.NextRow(ctx) 1250 if row == nil { 1251 return nil, nil 1252 } else if err != nil { 1253 // If this is not already a wrapped error, we will consider it to be 1254 // a generic physical error. 1255 // FIXME(joey): This may not be needed if we capture all the errors 1256 // encountered. This is a TBD when this change is polished. 1257 if !scrub.IsScrubError(err) { 1258 err = scrub.WrapError(scrub.PhysicalError, err) 1259 } 1260 return row, err 1261 } 1262 1263 // Decode the row in-place. The following check datum encoding 1264 // functions require that the table.row datums are decoded. 1265 for i := range row { 1266 if row[i].IsUnset() { 1267 rf.rowReadyTable.decodedRow[i] = tree.DNull 1268 continue 1269 } 1270 if err := row[i].EnsureDecoded(rf.rowReadyTable.cols[i].Type, rf.alloc); err != nil { 1271 return nil, err 1272 } 1273 rf.rowReadyTable.decodedRow[i] = row[i].Datum 1274 } 1275 1276 if index.ID == table.PrimaryIndex.ID { 1277 err = rf.checkPrimaryIndexDatumEncodings(ctx) 1278 } else { 1279 err = rf.checkSecondaryIndexDatumEncodings(ctx) 1280 } 1281 if err != nil { 1282 return row, err 1283 } 1284 1285 err = rf.checkKeyOrdering(ctx) 1286 1287 return row, err 1288 } 1289 1290 // checkPrimaryIndexDatumEncodings will run a round-trip encoding check 1291 // on all values in the buffered row. This check is specific to primary 1292 // index datums. 1293 func (rf *Fetcher) checkPrimaryIndexDatumEncodings(ctx context.Context) error { 1294 table := rf.rowReadyTable 1295 scratch := make([]byte, 1024) 1296 colIDToColumn := make(map[sqlbase.ColumnID]*sqlbase.ColumnDescriptor) 1297 for i := range table.desc.Columns { 1298 col := &table.desc.Columns[i] 1299 colIDToColumn[col.ID] = col 1300 } 1301 1302 rh := rowHelper{TableDesc: table.desc, Indexes: table.desc.Indexes} 1303 1304 for i := range table.desc.Families { 1305 var lastColID sqlbase.ColumnID 1306 familyID := table.desc.Families[i].ID 1307 familySortedColumnIDs, ok := rh.sortedColumnFamily(familyID) 1308 if !ok { 1309 return errors.AssertionFailedf("invalid family sorted column id map for family %d", familyID) 1310 } 1311 1312 for _, colID := range familySortedColumnIDs { 1313 rowVal := table.row[table.colIdxMap[colID]] 1314 if rowVal.IsNull() { 1315 // Column is not present. 1316 continue 1317 } 1318 1319 if skip, err := rh.skipColumnInPK(colID, familyID, rowVal.Datum); err != nil { 1320 return errors.NewAssertionErrorWithWrappedErrf(err, "unable to determine skip") 1321 } else if skip { 1322 continue 1323 } 1324 1325 col := colIDToColumn[colID] 1326 if col == nil { 1327 return errors.AssertionFailedf("column mapping not found for column %d", colID) 1328 } 1329 1330 if lastColID > col.ID { 1331 return errors.AssertionFailedf("cannot write column id %d after %d", col.ID, lastColID) 1332 } 1333 colIDDiff := col.ID - lastColID 1334 lastColID = col.ID 1335 1336 if result, err := sqlbase.EncodeTableValue([]byte(nil), colIDDiff, rowVal.Datum, 1337 scratch); err != nil { 1338 return errors.NewAssertionErrorWithWrappedErrf(err, "could not re-encode column %s, value was %#v", 1339 col.Name, rowVal.Datum) 1340 } else if !rowVal.BytesEqual(result) { 1341 return scrub.WrapError(scrub.IndexValueDecodingError, errors.Errorf( 1342 "value failed to round-trip encode. Column=%s colIDDiff=%d Key=%s expected %#v, got: %#v", 1343 col.Name, colIDDiff, rf.kv.Key, rowVal.EncodedString(), result)) 1344 } 1345 } 1346 } 1347 return nil 1348 } 1349 1350 // checkSecondaryIndexDatumEncodings will run a round-trip encoding 1351 // check on all values in the buffered row. This check is specific to 1352 // secondary index datums. 1353 func (rf *Fetcher) checkSecondaryIndexDatumEncodings(ctx context.Context) error { 1354 table := rf.rowReadyTable 1355 colToEncDatum := make(map[sqlbase.ColumnID]sqlbase.EncDatum, len(table.row)) 1356 values := make(tree.Datums, len(table.row)) 1357 for i, col := range table.cols { 1358 colToEncDatum[col.ID] = table.row[i] 1359 values[i] = table.row[i].Datum 1360 } 1361 1362 // The below code makes incorrect checks (#45256). 1363 indexEntries, err := sqlbase.EncodeSecondaryIndex( 1364 rf.codec, table.desc.TableDesc(), table.index, table.colIdxMap, values, false /* includeEmpty */) 1365 if err != nil { 1366 return err 1367 } 1368 1369 for _, indexEntry := range indexEntries { 1370 // We ignore the first 4 bytes of the values. These bytes are a 1371 // checksum which are not set by EncodeSecondaryIndex. 1372 if !indexEntry.Key.Equal(rf.rowReadyTable.lastKV.Key) { 1373 return scrub.WrapError(scrub.IndexKeyDecodingError, errors.Errorf( 1374 "secondary index key failed to round-trip encode. expected %#v, got: %#v", 1375 rf.rowReadyTable.lastKV.Key, indexEntry.Key)) 1376 } else if !indexEntry.Value.EqualData(table.lastKV.Value) { 1377 return scrub.WrapError(scrub.IndexValueDecodingError, errors.Errorf( 1378 "secondary index value failed to round-trip encode. expected %#v, got: %#v", 1379 rf.rowReadyTable.lastKV.Value, indexEntry.Value)) 1380 } 1381 } 1382 return nil 1383 } 1384 1385 // checkKeyOrdering verifies that the datums decoded for the current key 1386 // have the same ordering as the encoded key. 1387 func (rf *Fetcher) checkKeyOrdering(ctx context.Context) error { 1388 defer func() { 1389 rf.rowReadyTable.lastDatums = append(tree.Datums(nil), rf.rowReadyTable.decodedRow...) 1390 }() 1391 1392 if !rf.rowReadyTable.hasLast { 1393 rf.rowReadyTable.hasLast = true 1394 return nil 1395 } 1396 1397 evalCtx := tree.EvalContext{} 1398 // Iterate through columns in order, comparing each value to the value in the 1399 // previous row in that column. When the first column with a differing value 1400 // is found, compare the values to ensure the ordering matches the column 1401 // ordering. 1402 for i, id := range rf.rowReadyTable.index.ColumnIDs { 1403 idx := rf.rowReadyTable.colIdxMap[id] 1404 result := rf.rowReadyTable.decodedRow[idx].Compare(&evalCtx, rf.rowReadyTable.lastDatums[idx]) 1405 expectedDirection := rf.rowReadyTable.index.ColumnDirections[i] 1406 if rf.reverse && expectedDirection == sqlbase.IndexDescriptor_ASC { 1407 expectedDirection = sqlbase.IndexDescriptor_DESC 1408 } else if rf.reverse && expectedDirection == sqlbase.IndexDescriptor_DESC { 1409 expectedDirection = sqlbase.IndexDescriptor_ASC 1410 } 1411 1412 if result != 0 { 1413 if expectedDirection == sqlbase.IndexDescriptor_ASC && result < 0 || 1414 expectedDirection == sqlbase.IndexDescriptor_DESC && result > 0 { 1415 return scrub.WrapError(scrub.IndexKeyDecodingError, 1416 errors.Errorf("key ordering did not match datum ordering. IndexDescriptor=%s", 1417 expectedDirection)) 1418 } 1419 // After the first column with a differing value is found, the remaining 1420 // columns are skipped (see #32874). 1421 break 1422 } 1423 } 1424 return nil 1425 } 1426 1427 func (rf *Fetcher) finalizeRow() error { 1428 table := rf.rowReadyTable 1429 // Fill in any missing values with NULLs 1430 for i := range table.cols { 1431 if rf.valueColsFound == table.neededValueCols { 1432 // Found all cols - done! 1433 return nil 1434 } 1435 if table.neededCols.Contains(int(table.cols[i].ID)) && table.row[i].IsUnset() { 1436 // If the row was deleted, we'll be missing any non-primary key 1437 // columns, including nullable ones, but this is expected. 1438 if !table.cols[i].Nullable && !table.rowIsDeleted { 1439 var indexColValues []string 1440 for _, idx := range table.indexColIdx { 1441 if idx != -1 { 1442 indexColValues = append(indexColValues, table.row[idx].String(table.cols[idx].Type)) 1443 } else { 1444 indexColValues = append(indexColValues, "?") 1445 } 1446 } 1447 err := errors.AssertionFailedf( 1448 "Non-nullable column \"%s:%s\" with no value! Index scanned was %q with the index key columns (%s) and the values (%s)", 1449 table.desc.Name, table.cols[i].Name, table.index.Name, 1450 strings.Join(table.index.ColumnNames, ","), strings.Join(indexColValues, ",")) 1451 1452 if rf.isCheck { 1453 return scrub.WrapError(scrub.UnexpectedNullValueError, err) 1454 } 1455 return err 1456 } 1457 table.row[i] = sqlbase.EncDatum{ 1458 Datum: tree.DNull, 1459 } 1460 // We've set valueColsFound to the number of present columns in the row 1461 // already, in processValueBytes. Now, we're filling in columns that have 1462 // no encoded values with NULL - so we increment valueColsFound to permit 1463 // early exit from this loop once all needed columns are filled in. 1464 rf.valueColsFound++ 1465 } 1466 } 1467 return nil 1468 } 1469 1470 // Key returns the next key (the key that follows the last returned row). 1471 // Key returns nil when there are no more rows. 1472 func (rf *Fetcher) Key() roachpb.Key { 1473 return rf.kv.Key 1474 } 1475 1476 // PartialKey returns a partial slice of the next key (the key that follows the 1477 // last returned row) containing nCols columns, without the ending column 1478 // family. Returns nil when there are no more rows. 1479 func (rf *Fetcher) PartialKey(nCols int) (roachpb.Key, error) { 1480 if rf.kv.Key == nil { 1481 return nil, nil 1482 } 1483 n, err := consumeIndexKeyWithoutTableIDIndexIDPrefix( 1484 rf.currentTable.index, nCols, rf.kv.Key[rf.currentTable.knownPrefixLength:]) 1485 if err != nil { 1486 return nil, err 1487 } 1488 return rf.kv.Key[:n+rf.currentTable.knownPrefixLength], nil 1489 } 1490 1491 // GetRangesInfo returns information about the ranges where the rows came from. 1492 // The RangeInfo's are deduped and not ordered. 1493 func (rf *Fetcher) GetRangesInfo() []roachpb.RangeInfo { 1494 f := rf.kvFetcher 1495 if f == nil { 1496 // Not yet initialized. 1497 return nil 1498 } 1499 return f.GetRangesInfo() 1500 } 1501 1502 // GetBytesRead returns total number of bytes read by the underlying KVFetcher. 1503 func (rf *Fetcher) GetBytesRead() int64 { 1504 f := rf.kvFetcher 1505 if f == nil { 1506 // Not yet initialized. 1507 return 0 1508 } 1509 return f.bytesRead 1510 } 1511 1512 // Only unique secondary indexes have extra columns to decode (namely the 1513 // primary index columns). 1514 func hasExtraCols(table *tableInfo) bool { 1515 return table.isSecondaryIndex && table.index.Unique 1516 } 1517 1518 // consumeIndexKeyWithoutTableIDIndexIDPrefix consumes an index key that's 1519 // already pre-stripped of its table ID index ID prefix, up to nCols columns, 1520 // returning the number of bytes consumed. For example, given an input key 1521 // with values (6,7,8,9) such as /Table/60/1/6/7/#/61/1/8/9, stripping 3 columns 1522 // from this key would eat all but the final, 4th column 9 in this example, 1523 // producing /Table/60/1/6/7/#/61/1/8. If nCols was 2, instead, the result 1524 // would include the trailing table ID index ID pair, since that's a more 1525 // precise key: /Table/60/1/6/7/#/61/1. 1526 func consumeIndexKeyWithoutTableIDIndexIDPrefix( 1527 index *sqlbase.IndexDescriptor, nCols int, key []byte, 1528 ) (int, error) { 1529 origKeyLen := len(key) 1530 consumedCols := 0 1531 for _, ancestor := range index.Interleave.Ancestors { 1532 length := int(ancestor.SharedPrefixLen) 1533 // Skip up to length values. 1534 for j := 0; j < length; j++ { 1535 if consumedCols == nCols { 1536 // We're done early, in the middle of an interleave. 1537 return origKeyLen - len(key), nil 1538 } 1539 l, err := encoding.PeekLength(key) 1540 if err != nil { 1541 return 0, err 1542 } 1543 key = key[l:] 1544 consumedCols++ 1545 } 1546 var ok bool 1547 key, ok = encoding.DecodeIfInterleavedSentinel(key) 1548 if !ok { 1549 return 0, errors.New("unexpected lack of sentinel key") 1550 } 1551 1552 // Skip the TableID/IndexID pair for each ancestor except for the 1553 // first, which has already been skipped in our input. 1554 for j := 0; j < 2; j++ { 1555 idLen, err := encoding.PeekLength(key) 1556 if err != nil { 1557 return 0, err 1558 } 1559 key = key[idLen:] 1560 } 1561 } 1562 1563 // Decode the remaining values in the key, in the final interleave. 1564 for ; consumedCols < nCols; consumedCols++ { 1565 l, err := encoding.PeekLength(key) 1566 if err != nil { 1567 return 0, err 1568 } 1569 key = key[l:] 1570 } 1571 1572 return origKeyLen - len(key), nil 1573 }