github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/sqlbase/index_encoding.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package sqlbase 12 13 import ( 14 "context" 15 "fmt" 16 "sort" 17 18 "github.com/cockroachdb/cockroach/pkg/geo/geoindex" 19 "github.com/cockroachdb/cockroach/pkg/keys" 20 "github.com/cockroachdb/cockroach/pkg/kv" 21 "github.com/cockroachdb/cockroach/pkg/roachpb" 22 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 23 "github.com/cockroachdb/cockroach/pkg/sql/types" 24 "github.com/cockroachdb/cockroach/pkg/util" 25 "github.com/cockroachdb/cockroach/pkg/util/encoding" 26 "github.com/cockroachdb/cockroach/pkg/util/json" 27 "github.com/cockroachdb/cockroach/pkg/util/unique" 28 "github.com/cockroachdb/errors" 29 ) 30 31 // This file contains facilities to encode primary and secondary 32 // indexes on SQL tables. 33 34 // MakeIndexKeyPrefix returns the key prefix used for the index's data. If you 35 // need the corresponding Span, prefer desc.IndexSpan(indexID) or 36 // desc.PrimaryIndexSpan(). 37 func MakeIndexKeyPrefix(codec keys.SQLCodec, desc *TableDescriptor, indexID IndexID) []byte { 38 if i, err := desc.FindIndexByID(indexID); err == nil && len(i.Interleave.Ancestors) > 0 { 39 ancestor := &i.Interleave.Ancestors[0] 40 return codec.IndexPrefix(uint32(ancestor.TableID), uint32(ancestor.IndexID)) 41 } 42 return codec.IndexPrefix(uint32(desc.ID), uint32(indexID)) 43 } 44 45 // EncodeIndexKey creates a key by concatenating keyPrefix with the 46 // encodings of the columns in the index, and returns the key and 47 // whether any of the encoded values were NULLs. 48 // 49 // If a table or index is interleaved, `encoding.interleavedSentinel` 50 // is used in place of the family id (a varint) to signal the next 51 // component of the key. An example of one level of interleaving (a 52 // parent): 53 // /<parent_table_id>/<parent_index_id>/<field_1>/<field_2>/NullDesc/<table_id>/<index_id>/<field_3>/<family> 54 // 55 // Note that ExtraColumnIDs are not encoded, so the result isn't always a 56 // full index key. 57 func EncodeIndexKey( 58 tableDesc *TableDescriptor, 59 index *IndexDescriptor, 60 colMap map[ColumnID]int, 61 values []tree.Datum, 62 keyPrefix []byte, 63 ) (key []byte, containsNull bool, err error) { 64 return EncodePartialIndexKey( 65 tableDesc, 66 index, 67 len(index.ColumnIDs), /* encode all columns */ 68 colMap, 69 values, 70 keyPrefix, 71 ) 72 } 73 74 // EncodePartialIndexSpan creates the minimal key span for the key specified by the 75 // given table, index, and values, with the same method as 76 // EncodePartialIndexKey. 77 func EncodePartialIndexSpan( 78 tableDesc *TableDescriptor, 79 index *IndexDescriptor, 80 numCols int, 81 colMap map[ColumnID]int, 82 values []tree.Datum, 83 keyPrefix []byte, 84 ) (span roachpb.Span, containsNull bool, err error) { 85 var key roachpb.Key 86 var endKey roachpb.Key 87 key, containsNull, err = EncodePartialIndexKey(tableDesc, index, numCols, colMap, values, keyPrefix) 88 if err != nil { 89 return span, containsNull, err 90 } 91 if numCols == len(index.ColumnIDs) { 92 // If all values in the input index were specified, append an interleave 93 // marker instead of PrefixEnding the key, to avoid including any child 94 // interleaves of the input key. 95 endKey = encoding.EncodeInterleavedSentinel(key) 96 } else { 97 endKey = key.PrefixEnd() 98 } 99 return roachpb.Span{Key: key, EndKey: endKey}, containsNull, nil 100 } 101 102 // EncodePartialIndexKey encodes a partial index key; only the first numCols of 103 // the index key columns are encoded. The index key columns are 104 // - index.ColumnIDs for unique indexes, and 105 // - append(index.ColumnIDs, index.ExtraColumnIDs) for non-unique indexes. 106 func EncodePartialIndexKey( 107 tableDesc *TableDescriptor, 108 index *IndexDescriptor, 109 numCols int, 110 colMap map[ColumnID]int, 111 values []tree.Datum, 112 keyPrefix []byte, 113 ) (key []byte, containsNull bool, err error) { 114 var colIDs, extraColIDs []ColumnID 115 if numCols <= len(index.ColumnIDs) { 116 colIDs = index.ColumnIDs[:numCols] 117 } else { 118 if index.Unique || numCols > len(index.ColumnIDs)+len(index.ExtraColumnIDs) { 119 return nil, false, errors.Errorf("encoding too many columns (%d)", numCols) 120 } 121 colIDs = index.ColumnIDs 122 extraColIDs = index.ExtraColumnIDs[:numCols-len(index.ColumnIDs)] 123 } 124 125 // We know we will append to the key which will cause the capacity to grow so 126 // make it bigger from the get-go. 127 // Add twice the key prefix as an initial guess. 128 // Add 3 bytes for every ancestor: table,index id + interleave sentinel. 129 // Add 2 bytes for every column value. An underestimate for all but low integers. 130 key = make([]byte, len(keyPrefix), 2*len(keyPrefix)+3*len(index.Interleave.Ancestors)+2*len(values)) 131 copy(key, keyPrefix) 132 133 dirs := directions(index.ColumnDirections) 134 135 if len(index.Interleave.Ancestors) > 0 { 136 for i, ancestor := range index.Interleave.Ancestors { 137 // The first ancestor is assumed to already be encoded in keyPrefix. 138 if i != 0 { 139 key = EncodePartialTableIDIndexID(key, ancestor.TableID, ancestor.IndexID) 140 } 141 142 partial := false 143 length := int(ancestor.SharedPrefixLen) 144 if length > len(colIDs) { 145 length = len(colIDs) 146 partial = true 147 } 148 var n bool 149 key, n, err = EncodeColumns(colIDs[:length], dirs[:length], colMap, values, key) 150 if err != nil { 151 return nil, false, err 152 } 153 containsNull = containsNull || n 154 if partial { 155 // Early stop. Note that if we had exactly SharedPrefixLen columns 156 // remaining, we want to append the next tableID/indexID pair because 157 // that results in a more specific key. 158 return key, containsNull, nil 159 } 160 colIDs, dirs = colIDs[length:], dirs[length:] 161 // Each ancestor is separated by an interleaved 162 // sentinel (0xfe). 163 key = encoding.EncodeInterleavedSentinel(key) 164 } 165 166 key = EncodePartialTableIDIndexID(key, tableDesc.ID, index.ID) 167 } 168 169 var n bool 170 key, n, err = EncodeColumns(colIDs, dirs, colMap, values, key) 171 if err != nil { 172 return nil, false, err 173 } 174 containsNull = containsNull || n 175 176 key, n, err = EncodeColumns(extraColIDs, nil /* directions */, colMap, values, key) 177 if err != nil { 178 return nil, false, err 179 } 180 containsNull = containsNull || n 181 return key, containsNull, nil 182 } 183 184 type directions []IndexDescriptor_Direction 185 186 func (d directions) get(i int) (encoding.Direction, error) { 187 if i < len(d) { 188 return d[i].ToEncodingDirection() 189 } 190 return encoding.Ascending, nil 191 } 192 193 // MakeSpanFromEncDatums creates a minimal index key span on the input 194 // values. A minimal index key span is a span that includes the fewest possible 195 // keys after the start key generated by the input values. 196 // 197 // The start key is generated by concatenating keyPrefix with the encodings of 198 // the given EncDatum values. The values, types, and dirs parameters should be 199 // specified in the same order as the index key columns and may be a prefix. 200 // 201 // If a table or index is interleaved, `encoding.interleavedSentinel` is used 202 // in place of the family id (a varint) to signal the next component of the 203 // key. An example of one level of interleaving (a parent): 204 // /<parent_table_id>/<parent_index_id>/<field_1>/<field_2>/NullDesc/<table_id>/<index_id>/<field_3>/<family> 205 func MakeSpanFromEncDatums( 206 values EncDatumRow, 207 types []*types.T, 208 dirs []IndexDescriptor_Direction, 209 tableDesc *TableDescriptor, 210 index *IndexDescriptor, 211 alloc *DatumAlloc, 212 keyPrefix []byte, 213 ) (_ roachpb.Span, containsNull bool, _ error) { 214 startKey, complete, containsNull, err := makeKeyFromEncDatums(values, types, dirs, tableDesc, index, alloc, keyPrefix) 215 if err != nil { 216 return roachpb.Span{}, false, err 217 } 218 219 var endKey roachpb.Key 220 if complete && index.Unique { 221 // If all values in the input index were specified and the input index is 222 // unique, indicating that it might have child interleaves, append an 223 // interleave marker instead of PrefixEnding the key, to avoid including 224 // any child interleaves of the input key. 225 // 226 // Note that currently only primary indexes can contain interleaved 227 // tables or indexes, so this condition is broader than necessary in 228 // case one day we permit interleaving into arbitrary unique indexes. 229 // Note also that we could precisely only emit an interleaved sentinel 230 // if this index does in fact have interleaves - we choose not to do 231 // that to make testing simpler and traces and spans more consistent. 232 endKey = encoding.EncodeInterleavedSentinel(startKey) 233 } else { 234 endKey = startKey.PrefixEnd() 235 } 236 return roachpb.Span{Key: startKey, EndKey: endKey}, containsNull, nil 237 } 238 239 // NeededColumnFamilyIDs returns the minimal set of column families required to 240 // retrieve neededCols for the specified table and index. The returned FamilyIDs 241 // are in sorted order. 242 func NeededColumnFamilyIDs( 243 neededCols util.FastIntSet, table *TableDescriptor, index *IndexDescriptor, 244 ) []FamilyID { 245 if len(table.Families) == 1 { 246 return []FamilyID{table.Families[0].ID} 247 } 248 249 // Build some necessary data structures for column metadata. 250 columns := table.ColumnsWithMutations(true) 251 colIdxMap := table.ColumnIdxMapWithMutations(true) 252 var indexedCols util.FastIntSet 253 var compositeCols util.FastIntSet 254 var extraCols util.FastIntSet 255 for _, columnID := range index.ColumnIDs { 256 columnOrdinal := colIdxMap[columnID] 257 indexedCols.Add(columnOrdinal) 258 } 259 for _, columnID := range index.CompositeColumnIDs { 260 columnOrdinal := colIdxMap[columnID] 261 compositeCols.Add(columnOrdinal) 262 } 263 for _, columnID := range index.ExtraColumnIDs { 264 columnOrdinal := colIdxMap[columnID] 265 extraCols.Add(columnOrdinal) 266 } 267 268 // The column family with ID 0 is special because it always has a KV entry. 269 // Other column families will omit a value if all their columns are null, so 270 // we may need to retrieve family 0 to use as a sentinel for distinguishing 271 // between null values and the absence of a row. Also, secondary indexes store 272 // values here for composite and "extra" columns. ("Extra" means primary key 273 // columns which are not indexed.) 274 var family0 *ColumnFamilyDescriptor 275 hasSecondaryEncoding := index.GetEncodingType(table.PrimaryIndex.ID) == SecondaryIndexEncoding 276 277 // First iterate over the needed columns and look for a few special cases: 278 // columns which can be decoded from the key and columns whose value is stored 279 // in family 0. 280 family0Needed := false 281 nc := neededCols.Copy() 282 neededCols.ForEach(func(columnOrdinal int) { 283 if indexedCols.Contains(columnOrdinal) && !compositeCols.Contains(columnOrdinal) { 284 // We can decode this column from the index key, so no particular family 285 // is needed. 286 nc.Remove(columnOrdinal) 287 } 288 if hasSecondaryEncoding && (compositeCols.Contains(columnOrdinal) || 289 extraCols.Contains(columnOrdinal)) { 290 // Secondary indexes store composite and "extra" column values in family 291 // 0. 292 family0Needed = true 293 nc.Remove(columnOrdinal) 294 } 295 }) 296 297 // Iterate over the column families to find which ones contain needed columns. 298 // We also keep track of whether all of the needed families' columns are 299 // nullable, since this means we need column family 0 as a sentinel, even if 300 // none of its columns are needed. 301 var neededFamilyIDs []FamilyID 302 allFamiliesNullable := true 303 for i := range table.Families { 304 family := &table.Families[i] 305 needed := false 306 nullable := true 307 if family.ID == 0 { 308 // Set column family 0 aside in case we need it as a sentinel. 309 family0 = family 310 if family0Needed { 311 needed = true 312 } 313 nullable = false 314 } 315 for _, columnID := range family.ColumnIDs { 316 if needed && !nullable { 317 // Nothing left to check. 318 break 319 } 320 columnOrdinal := colIdxMap[columnID] 321 if nc.Contains(columnOrdinal) { 322 needed = true 323 } 324 if !columns[columnOrdinal].Nullable && (!indexedCols.Contains(columnOrdinal) || 325 compositeCols.Contains(columnOrdinal) && !hasSecondaryEncoding) { 326 // The column is non-nullable and cannot be decoded from a different 327 // family, so this column family must have a KV entry for every row. 328 nullable = false 329 } 330 } 331 if needed { 332 neededFamilyIDs = append(neededFamilyIDs, family.ID) 333 if !nullable { 334 allFamiliesNullable = false 335 } 336 } 337 } 338 if family0 == nil { 339 panic("column family 0 not found") 340 } 341 342 // If all the needed families are nullable, we also need family 0 as a 343 // sentinel. Note that this is only the case if family 0 was not already added 344 // to neededFamilyIDs. 345 if allFamiliesNullable { 346 // Prepend family 0. 347 neededFamilyIDs = append(neededFamilyIDs, 0) 348 copy(neededFamilyIDs[1:], neededFamilyIDs) 349 neededFamilyIDs[0] = family0.ID 350 } 351 352 return neededFamilyIDs 353 } 354 355 // SplitSpanIntoSeparateFamilies splits a span representing a single row point 356 // lookup into separate disjoint spans that request only the particular column 357 // families from neededFamilies instead of requesting all the families. It is up 358 // to the client to ensure the requested span represents a single row lookup and 359 // that the span splitting is appropriate (see CanSplitSpanIntoSeparateFamilies). 360 // 361 // The function accepts a slice of spans to append to. 362 func SplitSpanIntoSeparateFamilies( 363 appendTo roachpb.Spans, span roachpb.Span, neededFamilies []FamilyID, 364 ) roachpb.Spans { 365 span.Key = span.Key[:len(span.Key):len(span.Key)] // avoid mutation and aliasing 366 for i, familyID := range neededFamilies { 367 var famSpan roachpb.Span 368 famSpan.Key = keys.MakeFamilyKey(span.Key, uint32(familyID)) 369 famSpan.EndKey = famSpan.Key.PrefixEnd() 370 if i > 0 && familyID == neededFamilies[i-1]+1 { 371 // This column family is adjacent to the previous one. We can merge 372 // the two spans into one. 373 appendTo[len(appendTo)-1].EndKey = famSpan.EndKey 374 } else { 375 appendTo = append(appendTo, famSpan) 376 } 377 } 378 return appendTo 379 } 380 381 // makeKeyFromEncDatums creates an index key by concatenating keyPrefix with the 382 // encodings of the given EncDatum values. The values, types, and dirs 383 // parameters should be specified in the same order as the index key columns and 384 // may be a prefix. The complete return value is true if the resultant key 385 // fully constrains the index. 386 // 387 // If a table or index is interleaved, `encoding.interleavedSentinel` is used 388 // in place of the family id (a varint) to signal the next component of the 389 // key. An example of one level of interleaving (a parent): 390 // /<parent_table_id>/<parent_index_id>/<field_1>/<field_2>/NullDesc/<table_id>/<index_id>/<field_3>/<family> 391 func makeKeyFromEncDatums( 392 values EncDatumRow, 393 types []*types.T, 394 dirs []IndexDescriptor_Direction, 395 tableDesc *TableDescriptor, 396 index *IndexDescriptor, 397 alloc *DatumAlloc, 398 keyPrefix []byte, 399 ) (_ roachpb.Key, complete bool, containsNull bool, _ error) { 400 // Values may be a prefix of the index columns. 401 if len(values) > len(dirs) { 402 return nil, false, false, errors.Errorf("%d values, %d directions", len(values), len(dirs)) 403 } 404 if len(values) != len(types) { 405 return nil, false, false, errors.Errorf("%d values, %d types", len(values), len(types)) 406 } 407 // We know we will append to the key which will cause the capacity to grow 408 // so make it bigger from the get-go. 409 key := make(roachpb.Key, len(keyPrefix), len(keyPrefix)*2) 410 copy(key, keyPrefix) 411 412 if len(index.Interleave.Ancestors) > 0 { 413 for i, ancestor := range index.Interleave.Ancestors { 414 // The first ancestor is assumed to already be encoded in keyPrefix. 415 if i != 0 { 416 key = EncodePartialTableIDIndexID(key, ancestor.TableID, ancestor.IndexID) 417 } 418 419 partial := false 420 length := int(ancestor.SharedPrefixLen) 421 if length > len(types) { 422 length = len(types) 423 partial = true 424 } 425 var ( 426 err error 427 n bool 428 ) 429 key, n, err = appendEncDatumsToKey(key, types[:length], values[:length], dirs[:length], alloc) 430 if err != nil { 431 return nil, false, false, err 432 } 433 containsNull = containsNull || n 434 if partial { 435 // Early stop - the number of desired columns was fewer than the number 436 // left in the current interleave. 437 return key, false, false, nil 438 } 439 types, values, dirs = types[length:], values[length:], dirs[length:] 440 441 // Each ancestor is separated by an interleaved 442 // sentinel (0xfe). 443 key = encoding.EncodeInterleavedSentinel(key) 444 } 445 446 key = EncodePartialTableIDIndexID(key, tableDesc.ID, index.ID) 447 } 448 var ( 449 err error 450 n bool 451 ) 452 key, n, err = appendEncDatumsToKey(key, types, values, dirs, alloc) 453 if err != nil { 454 return key, false, false, err 455 } 456 containsNull = containsNull || n 457 return key, len(types) == len(index.ColumnIDs), containsNull, err 458 } 459 460 // findColumnValue returns the value corresponding to the column. If 461 // the column isn't present return a NULL value. 462 func findColumnValue(column ColumnID, colMap map[ColumnID]int, values []tree.Datum) tree.Datum { 463 if i, ok := colMap[column]; ok { 464 // TODO(pmattis): Need to convert the values[i] value to the type 465 // expected by the column. 466 return values[i] 467 } 468 return tree.DNull 469 } 470 471 // appendEncDatumsToKey concatenates the encoded representations of 472 // the datums at the end of the given roachpb.Key. 473 func appendEncDatumsToKey( 474 key roachpb.Key, 475 types []*types.T, 476 values EncDatumRow, 477 dirs []IndexDescriptor_Direction, 478 alloc *DatumAlloc, 479 ) (_ roachpb.Key, containsNull bool, _ error) { 480 for i, val := range values { 481 encoding := DatumEncoding_ASCENDING_KEY 482 if dirs[i] == IndexDescriptor_DESC { 483 encoding = DatumEncoding_DESCENDING_KEY 484 } 485 if val.IsNull() { 486 containsNull = true 487 } 488 var err error 489 key, err = val.Encode(types[i], alloc, encoding, key) 490 if err != nil { 491 return nil, false, err 492 } 493 } 494 return key, containsNull, nil 495 } 496 497 // EncodePartialTableIDIndexID encodes a table id followed by an index id to an 498 // existing key. The key must already contain a tenant id. 499 func EncodePartialTableIDIndexID(key []byte, tableID ID, indexID IndexID) []byte { 500 return keys.MakeTableIDIndexID(key, uint32(tableID), uint32(indexID)) 501 } 502 503 // DecodePartialTableIDIndexID decodes a table id followed by an index id. The 504 // input key must already have its tenant id removed. 505 func DecodePartialTableIDIndexID(key []byte) ([]byte, ID, IndexID, error) { 506 key, tableID, indexID, err := keys.DecodeTableIDIndexID(key) 507 return key, ID(tableID), IndexID(indexID), err 508 } 509 510 // DecodeIndexKeyPrefix decodes the prefix of an index key and returns the 511 // index id and a slice for the rest of the key. 512 // 513 // Don't use this function in the scan "hot path". 514 func DecodeIndexKeyPrefix( 515 codec keys.SQLCodec, desc *TableDescriptor, key []byte, 516 ) (indexID IndexID, remaining []byte, err error) { 517 key, err = codec.StripTenantPrefix(key) 518 if err != nil { 519 return 0, nil, err 520 } 521 522 // TODO(dan): This whole operation is n^2 because of the interleaves 523 // bookkeeping. We could improve it to n with a prefix tree of components. 524 525 interleaves := append([]IndexDescriptor{desc.PrimaryIndex}, desc.Indexes...) 526 527 for component := 0; ; component++ { 528 var tableID ID 529 key, tableID, indexID, err = DecodePartialTableIDIndexID(key) 530 if err != nil { 531 return 0, nil, err 532 } 533 if tableID == desc.ID { 534 // Once desc's table id has been decoded, there can be no more 535 // interleaves. 536 break 537 } 538 539 for i := len(interleaves) - 1; i >= 0; i-- { 540 if len(interleaves[i].Interleave.Ancestors) <= component || 541 interleaves[i].Interleave.Ancestors[component].TableID != tableID || 542 interleaves[i].Interleave.Ancestors[component].IndexID != indexID { 543 544 // This component, and thus this interleave, doesn't match what was 545 // decoded, remove it. 546 copy(interleaves[i:], interleaves[i+1:]) 547 interleaves = interleaves[:len(interleaves)-1] 548 } 549 } 550 // The decoded key doesn't many any known interleaves 551 if len(interleaves) == 0 { 552 return 0, nil, errors.Errorf("no known interleaves for key") 553 } 554 555 // Anything left has the same SharedPrefixLen at index `component`, so just 556 // use the first one. 557 for i := uint32(0); i < interleaves[0].Interleave.Ancestors[component].SharedPrefixLen; i++ { 558 l, err := encoding.PeekLength(key) 559 if err != nil { 560 return 0, nil, err 561 } 562 key = key[l:] 563 } 564 565 // Consume the interleaved sentinel. 566 var ok bool 567 key, ok = encoding.DecodeIfInterleavedSentinel(key) 568 if !ok { 569 return 0, nil, errors.Errorf("invalid interleave key") 570 } 571 } 572 573 return indexID, key, err 574 } 575 576 // DecodeIndexKey decodes the values that are a part of the specified index 577 // key (setting vals). 578 // 579 // The remaining bytes in the index key are returned which will either be an 580 // encoded column ID for the primary key index, the primary key suffix for 581 // non-unique secondary indexes or unique secondary indexes containing NULL or 582 // empty. If the given descriptor does not match the key, false is returned with 583 // no error. 584 func DecodeIndexKey( 585 codec keys.SQLCodec, 586 desc *TableDescriptor, 587 index *IndexDescriptor, 588 types []*types.T, 589 vals []EncDatum, 590 colDirs []IndexDescriptor_Direction, 591 key []byte, 592 ) (remainingKey []byte, matches bool, foundNull bool, _ error) { 593 key, err := codec.StripTenantPrefix(key) 594 if err != nil { 595 return nil, false, false, err 596 } 597 key, _, _, err = DecodePartialTableIDIndexID(key) 598 if err != nil { 599 return nil, false, false, err 600 } 601 return DecodeIndexKeyWithoutTableIDIndexIDPrefix(desc, index, types, vals, colDirs, key) 602 } 603 604 // DecodeIndexKeyWithoutTableIDIndexIDPrefix is the same as DecodeIndexKey, 605 // except it expects its index key is missing in its tenant id and first table 606 // id / index id key prefix. 607 func DecodeIndexKeyWithoutTableIDIndexIDPrefix( 608 desc *TableDescriptor, 609 index *IndexDescriptor, 610 types []*types.T, 611 vals []EncDatum, 612 colDirs []IndexDescriptor_Direction, 613 key []byte, 614 ) (remainingKey []byte, matches bool, foundNull bool, _ error) { 615 var decodedTableID ID 616 var decodedIndexID IndexID 617 var err error 618 619 if len(index.Interleave.Ancestors) > 0 { 620 for i, ancestor := range index.Interleave.Ancestors { 621 // Our input key had its first table id / index id chopped off, so 622 // don't try to decode those for the first ancestor. 623 if i != 0 { 624 key, decodedTableID, decodedIndexID, err = DecodePartialTableIDIndexID(key) 625 if err != nil { 626 return nil, false, false, err 627 } 628 if decodedTableID != ancestor.TableID || decodedIndexID != ancestor.IndexID { 629 return nil, false, false, nil 630 } 631 } 632 633 length := int(ancestor.SharedPrefixLen) 634 var isNull bool 635 key, isNull, err = DecodeKeyVals(types[:length], vals[:length], colDirs[:length], key) 636 if err != nil { 637 return nil, false, false, err 638 } 639 types, vals, colDirs = types[length:], vals[length:], colDirs[length:] 640 foundNull = foundNull || isNull 641 642 // Consume the interleaved sentinel. 643 var ok bool 644 key, ok = encoding.DecodeIfInterleavedSentinel(key) 645 if !ok { 646 return nil, false, false, nil 647 } 648 } 649 650 key, decodedTableID, decodedIndexID, err = DecodePartialTableIDIndexID(key) 651 if err != nil { 652 return nil, false, false, err 653 } 654 if decodedTableID != desc.ID || decodedIndexID != index.ID { 655 return nil, false, false, nil 656 } 657 } 658 659 var isNull bool 660 key, isNull, err = DecodeKeyVals(types, vals, colDirs, key) 661 if err != nil { 662 return nil, false, false, err 663 } 664 foundNull = foundNull || isNull 665 666 // We're expecting a column family id next (a varint). If 667 // interleavedSentinel is actually next, then this key is for a child 668 // table. 669 if _, ok := encoding.DecodeIfInterleavedSentinel(key); ok { 670 return nil, false, false, nil 671 } 672 673 return key, true, foundNull, nil 674 } 675 676 // DecodeKeyVals decodes the values that are part of the key. The decoded 677 // values are stored in the vals. If this slice is nil, the direction 678 // used will default to encoding.Ascending. 679 // DecodeKeyVals returns whether or not NULL was encountered in the key. 680 func DecodeKeyVals( 681 types []*types.T, vals []EncDatum, directions []IndexDescriptor_Direction, key []byte, 682 ) ([]byte, bool, error) { 683 if directions != nil && len(directions) != len(vals) { 684 return nil, false, errors.Errorf("encoding directions doesn't parallel vals: %d vs %d.", 685 len(directions), len(vals)) 686 } 687 foundNull := false 688 for j := range vals { 689 enc := DatumEncoding_ASCENDING_KEY 690 if directions != nil && (directions[j] == IndexDescriptor_DESC) { 691 enc = DatumEncoding_DESCENDING_KEY 692 } 693 var err error 694 vals[j], key, err = EncDatumFromBuffer(types[j], enc, key) 695 if err != nil { 696 return nil, false, err 697 } 698 if vals[j].IsNull() { 699 foundNull = true 700 } 701 } 702 return key, foundNull, nil 703 } 704 705 // ExtractIndexKey constructs the index (primary) key for a row from any index 706 // key/value entry, including secondary indexes. 707 // 708 // Don't use this function in the scan "hot path". 709 func ExtractIndexKey( 710 a *DatumAlloc, codec keys.SQLCodec, tableDesc *TableDescriptor, entry kv.KeyValue, 711 ) (roachpb.Key, error) { 712 indexID, key, err := DecodeIndexKeyPrefix(codec, tableDesc, entry.Key) 713 if err != nil { 714 return nil, err 715 } 716 if indexID == tableDesc.PrimaryIndex.ID { 717 return entry.Key, nil 718 } 719 720 index, err := tableDesc.FindIndexByID(indexID) 721 if err != nil { 722 return nil, err 723 } 724 725 // Extract the values for index.ColumnIDs. 726 indexTypes, err := GetColumnTypes(tableDesc, index.ColumnIDs) 727 if err != nil { 728 return nil, err 729 } 730 values := make([]EncDatum, len(index.ColumnIDs)) 731 dirs := index.ColumnDirections 732 if len(index.Interleave.Ancestors) > 0 { 733 // TODO(dan): In the interleaved index case, we parse the key twice; once to 734 // find the index id so we can look up the descriptor, and once to extract 735 // the values. Only parse once. 736 var ok bool 737 _, ok, _, err = DecodeIndexKey(codec, tableDesc, index, indexTypes, values, dirs, entry.Key) 738 if err != nil { 739 return nil, err 740 } 741 if !ok { 742 return nil, errors.Errorf("descriptor did not match key") 743 } 744 } else { 745 key, _, err = DecodeKeyVals(indexTypes, values, dirs, key) 746 if err != nil { 747 return nil, err 748 } 749 } 750 751 // Extract the values for index.ExtraColumnIDs 752 extraTypes, err := GetColumnTypes(tableDesc, index.ExtraColumnIDs) 753 if err != nil { 754 return nil, err 755 } 756 extraValues := make([]EncDatum, len(index.ExtraColumnIDs)) 757 dirs = make([]IndexDescriptor_Direction, len(index.ExtraColumnIDs)) 758 for i := range index.ExtraColumnIDs { 759 // Implicit columns are always encoded Ascending. 760 dirs[i] = IndexDescriptor_ASC 761 } 762 extraKey := key 763 if index.Unique { 764 extraKey, err = entry.Value.GetBytes() 765 if err != nil { 766 return nil, err 767 } 768 } 769 _, _, err = DecodeKeyVals(extraTypes, extraValues, dirs, extraKey) 770 if err != nil { 771 return nil, err 772 } 773 774 // Encode the index key from its components. 775 colMap := make(map[ColumnID]int) 776 for i, columnID := range index.ColumnIDs { 777 colMap[columnID] = i 778 } 779 for i, columnID := range index.ExtraColumnIDs { 780 colMap[columnID] = i + len(index.ColumnIDs) 781 } 782 indexKeyPrefix := MakeIndexKeyPrefix(codec, tableDesc, tableDesc.PrimaryIndex.ID) 783 784 decodedValues := make([]tree.Datum, len(values)+len(extraValues)) 785 for i, value := range values { 786 err := value.EnsureDecoded(indexTypes[i], a) 787 if err != nil { 788 return nil, err 789 } 790 decodedValues[i] = value.Datum 791 } 792 for i, value := range extraValues { 793 err := value.EnsureDecoded(extraTypes[i], a) 794 if err != nil { 795 return nil, err 796 } 797 decodedValues[len(values)+i] = value.Datum 798 } 799 indexKey, _, err := EncodeIndexKey( 800 tableDesc, &tableDesc.PrimaryIndex, colMap, decodedValues, indexKeyPrefix) 801 return indexKey, err 802 } 803 804 // IndexEntry represents an encoded key/value for an index entry. 805 type IndexEntry struct { 806 Key roachpb.Key 807 Value roachpb.Value 808 // Only used for forward indexes. 809 Family FamilyID 810 } 811 812 // valueEncodedColumn represents a composite or stored column of a secondary 813 // index. 814 type valueEncodedColumn struct { 815 id ColumnID 816 isComposite bool 817 } 818 819 // byID implements sort.Interface for []valueEncodedColumn based on the id 820 // field. 821 type byID []valueEncodedColumn 822 823 func (a byID) Len() int { return len(a) } 824 func (a byID) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 825 func (a byID) Less(i, j int) bool { return a[i].id < a[j].id } 826 827 // EncodeInvertedIndexKeys creates a list of inverted index keys by 828 // concatenating keyPrefix with the encodings of the column in the 829 // index. 830 func EncodeInvertedIndexKeys( 831 tableDesc *TableDescriptor, 832 index *IndexDescriptor, 833 colMap map[ColumnID]int, 834 values []tree.Datum, 835 keyPrefix []byte, 836 ) (key [][]byte, err error) { 837 if len(index.ColumnIDs) > 1 { 838 return nil, errors.AssertionFailedf("trying to apply inverted index to more than one column") 839 } 840 841 var val tree.Datum 842 if i, ok := colMap[index.ColumnIDs[0]]; ok { 843 val = values[i] 844 } else { 845 val = tree.DNull 846 } 847 if !geoindex.IsEmptyConfig(&index.GeoConfig) { 848 return EncodeGeoInvertedIndexTableKeys(val, keyPrefix, index) 849 } 850 return EncodeInvertedIndexTableKeys(val, keyPrefix) 851 } 852 853 // EncodeInvertedIndexTableKeys produces one inverted index key per element in 854 // the input datum, which should be a container (either JSON or Array). For 855 // JSON, "element" means unique path through the document. Each output key is 856 // prefixed by inKey, and is guaranteed to be lexicographically sortable, but 857 // not guaranteed to be round-trippable during decoding. If the input Datum 858 // is (SQL) NULL, no inverted index keys will be produced, because inverted 859 // indexes cannot and do not need to satisfy the predicate col IS NULL. 860 func EncodeInvertedIndexTableKeys(val tree.Datum, inKey []byte) (key [][]byte, err error) { 861 if val == tree.DNull { 862 return nil, nil 863 } 864 datum := tree.UnwrapDatum(nil, val) 865 switch val.ResolvedType().Family() { 866 case types.JsonFamily: 867 return json.EncodeInvertedIndexKeys(inKey, val.(*tree.DJSON).JSON) 868 case types.ArrayFamily: 869 return encodeArrayInvertedIndexTableKeys(val.(*tree.DArray), inKey) 870 } 871 return nil, errors.AssertionFailedf("trying to apply inverted index to unsupported type %s", datum.ResolvedType()) 872 } 873 874 // encodeArrayInvertedIndexTableKeys returns a list of inverted index keys for 875 // the given input array, one per entry in the array. The input inKey is 876 // prefixed to all returned keys. 877 // N.B.: This won't return any keys for 878 func encodeArrayInvertedIndexTableKeys(val *tree.DArray, inKey []byte) (key [][]byte, err error) { 879 outKeys := make([][]byte, 0, len(val.Array)) 880 for i := range val.Array { 881 d := val.Array[i] 882 if d == tree.DNull { 883 // We don't need to make keys for NULL, since in SQL: 884 // SELECT ARRAY[1, NULL, 2] @> ARRAY[NULL] 885 // returns false. 886 continue 887 } 888 outKey := make([]byte, len(inKey)) 889 copy(outKey, inKey) 890 newKey, err := EncodeTableKey(outKey, d, encoding.Ascending) 891 if err != nil { 892 return nil, err 893 } 894 outKeys = append(outKeys, newKey) 895 } 896 outKeys = unique.UniquifyByteSlices(outKeys) 897 return outKeys, nil 898 } 899 900 // EncodeGeoInvertedIndexTableKeys is the equivalent of EncodeInvertedIndexTableKeys 901 // for Geography and Geometry. 902 func EncodeGeoInvertedIndexTableKeys( 903 val tree.Datum, inKey []byte, index *IndexDescriptor, 904 ) (key [][]byte, err error) { 905 if val == tree.DNull { 906 return nil, nil 907 } 908 switch val.ResolvedType().Family() { 909 case types.GeographyFamily: 910 index := geoindex.NewS2GeographyIndex(*index.GeoConfig.S2Geography) 911 intKeys, err := index.InvertedIndexKeys(context.TODO(), val.(*tree.DGeography).Geography) 912 if err != nil { 913 return nil, err 914 } 915 return encodeGeoKeys(inKey, intKeys) 916 case types.GeometryFamily: 917 index := geoindex.NewS2GeometryIndex(*index.GeoConfig.S2Geometry) 918 intKeys, err := index.InvertedIndexKeys(context.TODO(), val.(*tree.DGeometry).Geometry) 919 if err != nil { 920 return nil, err 921 } 922 return encodeGeoKeys(inKey, intKeys) 923 default: 924 return nil, errors.Errorf("internal error: unexpected type: %s", val.ResolvedType().Family()) 925 } 926 } 927 928 func encodeGeoKeys(inKey []byte, geoKeys []geoindex.Key) (keys [][]byte, err error) { 929 keys = make([][]byte, 0, len(geoKeys)) 930 for _, k := range geoKeys { 931 outKey := make([]byte, len(inKey)) 932 copy(outKey, inKey) 933 d := (tree.DInt)(k) 934 newKey, err := EncodeTableKey(outKey, &d, encoding.Ascending) 935 if err != nil { 936 return nil, err 937 } 938 keys = append(keys, newKey) 939 } 940 return keys, nil 941 } 942 943 // EncodePrimaryIndex constructs a list of k/v pairs for a 944 // row encoded as a primary index. This function mirrors the encoding 945 // logic in prepareInsertOrUpdateBatch in pkg/sql/row/writer.go. 946 // It is somewhat duplicated here due to the different arguments 947 // that prepareOrInsertUpdateBatch needs and uses to generate 948 // the k/v's for the row it inserts. includeEmpty controls 949 // whether or not k/v's with empty values should be returned. 950 // It returns indexEntries in family sorted order. 951 func EncodePrimaryIndex( 952 codec keys.SQLCodec, 953 tableDesc *TableDescriptor, 954 index *IndexDescriptor, 955 colMap map[ColumnID]int, 956 values []tree.Datum, 957 includeEmpty bool, 958 ) ([]IndexEntry, error) { 959 keyPrefix := MakeIndexKeyPrefix(codec, tableDesc, index.ID) 960 indexKey, _, err := EncodeIndexKey(tableDesc, index, colMap, values, keyPrefix) 961 if err != nil { 962 return nil, err 963 } 964 // This information should be precomputed on the table descriptor. 965 indexedColumns := map[ColumnID]struct{}{} 966 for _, colID := range index.ColumnIDs { 967 indexedColumns[colID] = struct{}{} 968 } 969 var entryValue []byte 970 indexEntries := make([]IndexEntry, 0, len(tableDesc.Families)) 971 var columnsToEncode []valueEncodedColumn 972 973 for i := range tableDesc.Families { 974 var err error 975 family := &tableDesc.Families[i] 976 if i > 0 { 977 indexKey = indexKey[:len(indexKey):len(indexKey)] 978 entryValue = entryValue[:0] 979 columnsToEncode = columnsToEncode[:0] 980 } 981 familyKey := keys.MakeFamilyKey(indexKey, uint32(family.ID)) 982 // The decoders expect that column family 0 is encoded with a TUPLE value tag, so we 983 // don't want to use the untagged value encoding. 984 if len(family.ColumnIDs) == 1 && family.ColumnIDs[0] == family.DefaultColumnID && family.ID != 0 { 985 datum := findColumnValue(family.DefaultColumnID, colMap, values) 986 // We want to include this column if its value is non-null or 987 // we were requested to include all of the columns. 988 if datum != tree.DNull || includeEmpty { 989 col, err := tableDesc.FindColumnByID(family.DefaultColumnID) 990 if err != nil { 991 return nil, err 992 } 993 value, err := MarshalColumnValue(col, datum) 994 if err != nil { 995 return nil, err 996 } 997 indexEntries = append(indexEntries, IndexEntry{Key: familyKey, Value: value, Family: family.ID}) 998 } 999 continue 1000 } 1001 1002 for _, colID := range family.ColumnIDs { 1003 if _, ok := indexedColumns[colID]; !ok { 1004 columnsToEncode = append(columnsToEncode, valueEncodedColumn{id: colID}) 1005 continue 1006 } 1007 if cdatum, ok := values[colMap[colID]].(tree.CompositeDatum); ok { 1008 if cdatum.IsComposite() { 1009 columnsToEncode = append(columnsToEncode, valueEncodedColumn{id: colID, isComposite: true}) 1010 continue 1011 } 1012 } 1013 } 1014 sort.Sort(byID(columnsToEncode)) 1015 entryValue, err = writeColumnValues(entryValue, colMap, values, columnsToEncode) 1016 if err != nil { 1017 return nil, err 1018 } 1019 if family.ID != 0 && len(entryValue) == 0 && !includeEmpty { 1020 continue 1021 } 1022 entry := IndexEntry{Key: familyKey, Family: family.ID} 1023 entry.Value.SetTuple(entryValue) 1024 indexEntries = append(indexEntries, entry) 1025 } 1026 return indexEntries, nil 1027 } 1028 1029 // EncodeSecondaryIndex encodes key/values for a secondary 1030 // index. colMap maps ColumnIDs to indices in `values`. This returns a 1031 // slice of IndexEntry. includeEmpty controls whether or not 1032 // EncodeSecondaryIndex should return k/v's that contain 1033 // empty values. For forward indexes the returned list of 1034 // index entries is in family sorted order. 1035 func EncodeSecondaryIndex( 1036 codec keys.SQLCodec, 1037 tableDesc *TableDescriptor, 1038 secondaryIndex *IndexDescriptor, 1039 colMap map[ColumnID]int, 1040 values []tree.Datum, 1041 includeEmpty bool, 1042 ) ([]IndexEntry, error) { 1043 secondaryIndexKeyPrefix := MakeIndexKeyPrefix(codec, tableDesc, secondaryIndex.ID) 1044 1045 // Use the primary key encoding for covering indexes. 1046 if secondaryIndex.GetEncodingType(tableDesc.PrimaryIndex.ID) == PrimaryIndexEncoding { 1047 return EncodePrimaryIndex(codec, tableDesc, secondaryIndex, colMap, values, includeEmpty) 1048 } 1049 1050 var containsNull = false 1051 var secondaryKeys [][]byte 1052 var err error 1053 if secondaryIndex.Type == IndexDescriptor_INVERTED { 1054 secondaryKeys, err = EncodeInvertedIndexKeys(tableDesc, secondaryIndex, colMap, values, secondaryIndexKeyPrefix) 1055 } else { 1056 var secondaryIndexKey []byte 1057 secondaryIndexKey, containsNull, err = EncodeIndexKey( 1058 tableDesc, secondaryIndex, colMap, values, secondaryIndexKeyPrefix) 1059 1060 secondaryKeys = [][]byte{secondaryIndexKey} 1061 } 1062 if err != nil { 1063 return []IndexEntry{}, err 1064 } 1065 1066 // Add the extra columns - they are encoded in ascending order which is done 1067 // by passing nil for the encoding directions. 1068 extraKey, _, err := EncodeColumns(secondaryIndex.ExtraColumnIDs, nil, 1069 colMap, values, nil) 1070 if err != nil { 1071 return []IndexEntry{}, err 1072 } 1073 1074 // entries is the resulting array that we will return. We allocate upfront at least 1075 // len(secondaryKeys) positions to avoid allocations from appending. 1076 entries := make([]IndexEntry, 0, len(secondaryKeys)) 1077 for _, key := range secondaryKeys { 1078 if !secondaryIndex.Unique || containsNull { 1079 // If the index is not unique or it contains a NULL value, append 1080 // extraKey to the key in order to make it unique. 1081 key = append(key, extraKey...) 1082 } 1083 1084 if len(tableDesc.Families) == 1 || 1085 secondaryIndex.Type == IndexDescriptor_INVERTED || 1086 secondaryIndex.Version == BaseIndexFormatVersion { 1087 // We do all computation that affects indexes with families in a separate code path to avoid performance 1088 // regression for tables without column families. 1089 entry, err := encodeSecondaryIndexNoFamilies(secondaryIndex, colMap, key, values, extraKey) 1090 if err != nil { 1091 return []IndexEntry{}, err 1092 } 1093 entries = append(entries, entry) 1094 } else { 1095 // This is only executed once as len(secondaryKeys) = 1 for non inverted secondary indexes. 1096 // Create a mapping of family ID to stored columns. 1097 // TODO (rohany): we want to share this information across calls to EncodeSecondaryIndex -- 1098 // its not easy to do this right now. It would be nice if the index descriptor or table descriptor 1099 // had this information computed/cached for us. 1100 familyToColumns := make(map[FamilyID][]valueEncodedColumn) 1101 addToFamilyColMap := func(id FamilyID, column valueEncodedColumn) { 1102 if _, ok := familyToColumns[id]; !ok { 1103 familyToColumns[id] = []valueEncodedColumn{} 1104 } 1105 familyToColumns[id] = append(familyToColumns[id], column) 1106 } 1107 // Ensure that column family 0 always generates a k/v pair. 1108 familyToColumns[0] = []valueEncodedColumn{} 1109 // All composite columns are stored in family 0. 1110 for _, id := range secondaryIndex.CompositeColumnIDs { 1111 addToFamilyColMap(0, valueEncodedColumn{id: id, isComposite: true}) 1112 } 1113 for _, family := range tableDesc.Families { 1114 for _, id := range secondaryIndex.StoreColumnIDs { 1115 for _, col := range family.ColumnIDs { 1116 if id == col { 1117 addToFamilyColMap(family.ID, valueEncodedColumn{id: id, isComposite: false}) 1118 } 1119 } 1120 } 1121 } 1122 entries, err = encodeSecondaryIndexWithFamilies( 1123 familyToColumns, secondaryIndex, colMap, key, values, extraKey, entries, includeEmpty) 1124 if err != nil { 1125 return []IndexEntry{}, err 1126 } 1127 } 1128 } 1129 return entries, nil 1130 } 1131 1132 // encodeSecondaryIndexWithFamilies generates a k/v pair for 1133 // each family/column pair in familyMap. The row parameter will be 1134 // modified by the function, so copy it before using. includeEmpty 1135 // controls whether or not k/v's with empty values will be returned. 1136 // The returned indexEntries are in family sorted order. 1137 func encodeSecondaryIndexWithFamilies( 1138 familyMap map[FamilyID][]valueEncodedColumn, 1139 index *IndexDescriptor, 1140 colMap map[ColumnID]int, 1141 key []byte, 1142 row []tree.Datum, 1143 extraKeyCols []byte, 1144 results []IndexEntry, 1145 includeEmpty bool, 1146 ) ([]IndexEntry, error) { 1147 var ( 1148 value []byte 1149 err error 1150 ) 1151 origKeyLen := len(key) 1152 // TODO (rohany): is there a natural way of caching this information as well? 1153 // We have to iterate over the map in sorted family order. Other parts of the code 1154 // depend on a per-call consistent order of keys generated. 1155 familyIDs := make([]int, 0, len(familyMap)) 1156 for familyID := range familyMap { 1157 familyIDs = append(familyIDs, int(familyID)) 1158 } 1159 sort.Ints(familyIDs) 1160 for _, familyID := range familyIDs { 1161 storedColsInFam := familyMap[FamilyID(familyID)] 1162 // Ensure that future appends to key will cause a copy and not overwrite 1163 // existing key values. 1164 key = key[:origKeyLen:origKeyLen] 1165 1166 // If we aren't storing any columns in this family and we are not the first family, 1167 // skip onto the next family. We need to write family 0 no matter what to ensure 1168 // that each row has at least one entry in the DB. 1169 if len(storedColsInFam) == 0 && familyID != 0 { 1170 continue 1171 } 1172 1173 sort.Sort(byID(storedColsInFam)) 1174 1175 key = keys.MakeFamilyKey(key, uint32(familyID)) 1176 if index.Unique && familyID == 0 { 1177 // Note that a unique secondary index that contains a NULL column value 1178 // will have extraKey appended to the key and stored in the value. We 1179 // require extraKey to be appended to the key in order to make the key 1180 // unique. We could potentially get rid of the duplication here but at 1181 // the expense of complicating scanNode when dealing with unique 1182 // secondary indexes. 1183 value = extraKeyCols 1184 } else { 1185 // The zero value for an index-value is a 0-length bytes value. 1186 value = []byte{} 1187 } 1188 1189 value, err = writeColumnValues(value, colMap, row, storedColsInFam) 1190 if err != nil { 1191 return []IndexEntry{}, err 1192 } 1193 entry := IndexEntry{Key: key, Family: FamilyID(familyID)} 1194 // If we aren't looking at family 0 and don't have a value, 1195 // don't include an entry for this k/v. 1196 if familyID != 0 && len(value) == 0 && !includeEmpty { 1197 continue 1198 } 1199 // If we are looking at family 0, encode the data as BYTES, as it might 1200 // include encoded primary key columns. For other families, use the 1201 // tuple encoding for the value. 1202 if familyID == 0 { 1203 entry.Value.SetBytes(value) 1204 } else { 1205 entry.Value.SetTuple(value) 1206 } 1207 results = append(results, entry) 1208 } 1209 return results, nil 1210 } 1211 1212 // encodeSecondaryIndexNoFamilies takes a mostly constructed 1213 // secondary index key (without the family/sentinel at 1214 // the end), and appends the 0 family sentinel to it, and 1215 // constructs the value portion of the index. This function 1216 // performs the index encoding version before column 1217 // families were introduced onto secondary indexes. 1218 func encodeSecondaryIndexNoFamilies( 1219 index *IndexDescriptor, 1220 colMap map[ColumnID]int, 1221 key []byte, 1222 row []tree.Datum, 1223 extraKeyCols []byte, 1224 ) (IndexEntry, error) { 1225 var ( 1226 value []byte 1227 err error 1228 ) 1229 // If we aren't encoding index keys with families, all index keys use the sentinel family 0. 1230 key = keys.MakeFamilyKey(key, 0) 1231 if index.Unique { 1232 // Note that a unique secondary index that contains a NULL column value 1233 // will have extraKey appended to the key and stored in the value. We 1234 // require extraKey to be appended to the key in order to make the key 1235 // unique. We could potentially get rid of the duplication here but at 1236 // the expense of complicating scanNode when dealing with unique 1237 // secondary indexes. 1238 value = append(value, extraKeyCols...) 1239 } else { 1240 // The zero value for an index-value is a 0-length bytes value. 1241 value = []byte{} 1242 } 1243 var cols []valueEncodedColumn 1244 // Since we aren't encoding data with families, we just encode all stored and composite columns in the value. 1245 for _, id := range index.StoreColumnIDs { 1246 cols = append(cols, valueEncodedColumn{id: id, isComposite: false}) 1247 } 1248 for _, id := range index.CompositeColumnIDs { 1249 // Inverted indexes on a composite type (i.e. an array of composite types) 1250 // should not add the indexed column to the value. 1251 if index.Type == IndexDescriptor_INVERTED && id == index.ColumnIDs[0] { 1252 continue 1253 } 1254 cols = append(cols, valueEncodedColumn{id: id, isComposite: true}) 1255 } 1256 sort.Sort(byID(cols)) 1257 value, err = writeColumnValues(value, colMap, row, cols) 1258 if err != nil { 1259 return IndexEntry{}, err 1260 } 1261 entry := IndexEntry{Key: key, Family: 0} 1262 entry.Value.SetBytes(value) 1263 return entry, nil 1264 } 1265 1266 // writeColumnValues writes the value encoded versions of the desired columns from the input 1267 // row of datums into the value byte slice. 1268 func writeColumnValues( 1269 value []byte, colMap map[ColumnID]int, row []tree.Datum, columns []valueEncodedColumn, 1270 ) ([]byte, error) { 1271 var lastColID ColumnID 1272 for _, col := range columns { 1273 val := findColumnValue(col.id, colMap, row) 1274 if val == tree.DNull || (col.isComposite && !val.(tree.CompositeDatum).IsComposite()) { 1275 continue 1276 } 1277 if lastColID > col.id { 1278 panic(fmt.Errorf("cannot write column id %d after %d", col.id, lastColID)) 1279 } 1280 colIDDiff := col.id - lastColID 1281 lastColID = col.id 1282 var err error 1283 value, err = EncodeTableValue(value, colIDDiff, val, nil) 1284 if err != nil { 1285 return nil, err 1286 } 1287 } 1288 return value, nil 1289 } 1290 1291 // EncodeSecondaryIndexes encodes key/values for the secondary indexes. colMap 1292 // maps ColumnIDs to indices in `values`. secondaryIndexEntries is the return 1293 // value (passed as a parameter so the caller can reuse between rows) and is 1294 // expected to be the same length as indexes. 1295 func EncodeSecondaryIndexes( 1296 codec keys.SQLCodec, 1297 tableDesc *TableDescriptor, 1298 indexes []IndexDescriptor, 1299 colMap map[ColumnID]int, 1300 values []tree.Datum, 1301 secondaryIndexEntries []IndexEntry, 1302 includeEmpty bool, 1303 ) ([]IndexEntry, error) { 1304 if len(secondaryIndexEntries) > 0 { 1305 panic("Length of secondaryIndexEntries was non-zero") 1306 } 1307 for i := range indexes { 1308 entries, err := EncodeSecondaryIndex(codec, tableDesc, &indexes[i], colMap, values, includeEmpty) 1309 if err != nil { 1310 return secondaryIndexEntries, err 1311 } 1312 // Normally, each index will have exactly one entry. However, inverted 1313 // indexes can have 0 or >1 entries, as well as secondary indexes which 1314 // store columns from multiple column families. 1315 secondaryIndexEntries = append(secondaryIndexEntries, entries...) 1316 } 1317 return secondaryIndexEntries, nil 1318 } 1319 1320 // IndexKeyEquivSignature parses an index key if and only if the index 1321 // key belongs to a table where its equivalence signature and all its 1322 // interleave ancestors' signatures can be found in 1323 // validEquivSignatures. 1324 // 1325 // Its validEquivSignatures argument is a map containing equivalence 1326 // signatures of valid ancestors of the desired table and of the 1327 // desired table itself. 1328 // 1329 // IndexKeyEquivSignature returns whether or not the index key 1330 // satisfies the above condition, the value mapped to by the desired 1331 // table (could be a table index), and the rest of the key that's not 1332 // part of the signature. 1333 // 1334 // It also requires two []byte buffers: one for the signature 1335 // (signatureBuf) and one for the rest of the key (keyRestBuf). 1336 // 1337 // The equivalence signature defines the equivalence classes for the 1338 // signature of potentially interleaved tables. For example, the 1339 // equivalence signatures for the following interleaved indexes: 1340 // 1341 // <parent@primary> 1342 // <child@secondary> 1343 // 1344 // and index keys 1345 // <parent index key>: /<parent table id>/<parent index id>/<val 1>/<val 2> 1346 // <child index key>: /<parent table id>/<parent index id>/<val 1>/<val 2>/#/<child table id>/child index id>/<val 3>/<val 4> 1347 // 1348 // correspond to the equivalence signatures 1349 // <parent@primary>: /<parent table id>/<parent index id> 1350 // <child@secondary>: /<parent table id>/<parent index id>/#/<child table id>/<child index id> 1351 // 1352 // Equivalence signatures allow us to associate an index key with its 1353 // table without having to invoke DecodeIndexKey multiple times. 1354 // 1355 // IndexKeyEquivSignature will return false if the a table's 1356 // ancestor's signature or the table's signature (table which the 1357 // index key belongs to) is not mapped in validEquivSignatures. 1358 // 1359 // For example, suppose the given key is 1360 // 1361 // /<t2 table id>/<t2 index id>/<val t2>/#/<t3 table id>/<t3 table id>/<val t3> 1362 // 1363 // and validEquivSignatures contains 1364 // 1365 // /<t1 table id>/t1 index id> 1366 // /<t1 table id>/t1 index id>/#/<t4 table id>/<t4 index id> 1367 // 1368 // IndexKeyEquivSignature will short-circuit and return false once 1369 // 1370 // /<t2 table id>/<t2 index id> 1371 // 1372 // is processed since t2's signature is not specified in validEquivSignatures. 1373 func IndexKeyEquivSignature( 1374 key []byte, validEquivSignatures map[string]int, signatureBuf []byte, restBuf []byte, 1375 ) (tableIdx int, restResult []byte, success bool, err error) { 1376 signatureBuf = signatureBuf[:0] 1377 restResult = restBuf[:0] 1378 for { 1379 // Well-formed key is guaranteed to to have 2 varints for every 1380 // ancestor: the TableID and IndexID. 1381 // We extract these out and add them to our buffer. 1382 for i := 0; i < 2; i++ { 1383 idLen, err := encoding.PeekLength(key) 1384 if err != nil { 1385 return 0, nil, false, err 1386 } 1387 signatureBuf = append(signatureBuf, key[:idLen]...) 1388 key = key[idLen:] 1389 } 1390 1391 // The current signature (either an ancestor table's or the key's) 1392 // is not one of the validEquivSignatures. 1393 // We can short-circuit and return false. 1394 recentTableIdx, found := validEquivSignatures[string(signatureBuf)] 1395 if !found { 1396 return 0, nil, false, nil 1397 } 1398 1399 var isSentinel bool 1400 // Peek and discard encoded index values. 1401 for { 1402 key, isSentinel = encoding.DecodeIfInterleavedSentinel(key) 1403 // We stop once the key is empty or if we encounter a 1404 // sentinel for the next TableID-IndexID pair. 1405 if len(key) == 0 || isSentinel { 1406 break 1407 } 1408 len, err := encoding.PeekLength(key) 1409 if err != nil { 1410 return 0, nil, false, err 1411 } 1412 // Append any other bytes (column values initially, 1413 // then family ID and timestamp) to return. 1414 restResult = append(restResult, key[:len]...) 1415 key = key[len:] 1416 } 1417 1418 if !isSentinel { 1419 // The key has been fully decomposed and is valid up to 1420 // this point. 1421 // Return the most recent table index from 1422 // validEquivSignatures. 1423 return recentTableIdx, restResult, true, nil 1424 } 1425 // If there was a sentinel, we know there are more 1426 // descendant(s). 1427 // We insert an interleave sentinel and continue extracting the 1428 // next descendant's IDs. 1429 signatureBuf = encoding.EncodeInterleavedSentinel(signatureBuf) 1430 } 1431 } 1432 1433 // TableEquivSignatures returns the equivalence signatures for each interleave 1434 // ancestor and itself. See IndexKeyEquivSignature for more info. 1435 func TableEquivSignatures( 1436 desc *TableDescriptor, index *IndexDescriptor, 1437 ) (signatures [][]byte, err error) { 1438 // signatures contains the slice reference to the signature of every 1439 // ancestor of the current table-index. 1440 // The last slice reference is the given table-index's signature. 1441 signatures = make([][]byte, len(index.Interleave.Ancestors)+1) 1442 // fullSignature is the backing byte slice for each individual signature 1443 // as it buffers each block of table and index IDs. 1444 // We eagerly allocate 4 bytes for each of the two IDs per ancestor 1445 // (which can fit Uvarint IDs up to 2^17-1 without another allocation), 1446 // 1 byte for each interleave sentinel, and 4 bytes each for the given 1447 // table's and index's ID. 1448 fullSignature := make([]byte, 0, len(index.Interleave.Ancestors)*9+8) 1449 1450 // Encode the table's ancestors' TableIDs and IndexIDs. 1451 for i, ancestor := range index.Interleave.Ancestors { 1452 fullSignature = EncodePartialTableIDIndexID(fullSignature, ancestor.TableID, ancestor.IndexID) 1453 // Create a reference up to this point for the ancestor's 1454 // signature. 1455 signatures[i] = fullSignature 1456 // Append Interleave sentinel after every ancestor. 1457 fullSignature = encoding.EncodeInterleavedSentinel(fullSignature) 1458 } 1459 1460 // Encode the table's table and index IDs. 1461 fullSignature = EncodePartialTableIDIndexID(fullSignature, desc.ID, index.ID) 1462 // Create a reference for the given table's signature as the last 1463 // element of signatures. 1464 signatures[len(signatures)-1] = fullSignature 1465 1466 return signatures, nil 1467 } 1468 1469 // maxKeyTokens returns the maximum number of key tokens in an index's key, 1470 // including the table ID, index ID, and index column values (including extra 1471 // columns that may be stored in the key). 1472 // It requires knowledge of whether the key will or might contain a NULL value: 1473 // if uncertain, pass in true to 'overestimate' the maxKeyTokens. 1474 // 1475 // In general, a key belonging to an interleaved index grandchild is encoded as: 1476 // 1477 // /table/index/<parent-pk1>/.../<parent-pkX>/#/table/index/<child-pk1>/.../<child-pkY>/#/table/index/<grandchild-pk1>/.../<grandchild-pkZ> 1478 // 1479 // The part of the key with respect to the grandchild index would be 1480 // the entire key since there are no grand-grandchild table/index IDs or 1481 // <grandgrandchild-pk>. The maximal prefix of the key that belongs to child is 1482 // 1483 // /table/index/<parent-pk1>/.../<parent-pkX>/#/table/index/<child-pk1>/.../<child-pkY> 1484 // 1485 // and the maximal prefix of the key that belongs to parent is 1486 // 1487 // /table/index/<parent-pk1>/.../<parent-pkX> 1488 // 1489 // This returns the maximum number of <tokens> in this prefix. 1490 func maxKeyTokens(index *IndexDescriptor, containsNull bool) int { 1491 nTables := len(index.Interleave.Ancestors) + 1 1492 nKeyCols := len(index.ColumnIDs) 1493 1494 // Non-unique secondary indexes or unique secondary indexes with a NULL 1495 // value have additional columns in the key that may appear in a span 1496 // (e.g. primary key columns not part of the index). 1497 // See EncodeSecondaryIndex. 1498 if !index.Unique || containsNull { 1499 nKeyCols += len(index.ExtraColumnIDs) 1500 } 1501 1502 // To illustrate how we compute max # of key tokens, take the 1503 // key in the example above and let the respective index be child. 1504 // We'd like to return the number of bytes in 1505 // 1506 // /table/index/<parent-pk1>/.../<parent-pkX>/#/table/index/<child-pk1>/.../<child-pkY> 1507 // For each table-index, there is 1508 // 1. table ID 1509 // 2. index ID 1510 // 3. interleave sentinel 1511 // or 3 * nTables. 1512 // Each <parent-pkX> must be a part of the index's columns (nKeys). 1513 // Finally, we do not want to include the interleave sentinel for the 1514 // current index (-1). 1515 return 3*nTables + nKeyCols - 1 1516 } 1517 1518 // AdjustStartKeyForInterleave adjusts the start key to skip unnecessary 1519 // interleaved sections. 1520 // 1521 // For example, if child is interleaved into parent, a typical parent 1522 // span might look like 1523 // /1 - /3 1524 // and a typical child span might look like 1525 // /1/#/2 - /2/#/5 1526 // Suppose the parent span is 1527 // /1/#/2 - /3 1528 // where the start key is a child's index key. Notice that the first parent 1529 // key read actually starts at /2 since all the parent keys with the prefix 1530 // /1 come before the child key /1/#/2 (and is not read in the span). 1531 // We can thus push forward the start key from /1/#/2 to /2. If the start key 1532 // was /1, we cannot push this forwards since that is the first key we want 1533 // to read. 1534 func AdjustStartKeyForInterleave( 1535 codec keys.SQLCodec, index *IndexDescriptor, start roachpb.Key, 1536 ) (roachpb.Key, error) { 1537 // Remove the tenant prefix before decomposing. 1538 strippedStart, err := codec.StripTenantPrefix(start) 1539 if err != nil { 1540 return roachpb.Key{}, err 1541 } 1542 1543 keyTokens, containsNull, err := encoding.DecomposeKeyTokens(strippedStart) 1544 if err != nil { 1545 return roachpb.Key{}, err 1546 } 1547 nIndexTokens := maxKeyTokens(index, containsNull) 1548 1549 // This is either the index's own key or one of its ancestor's key. 1550 // Nothing to do. 1551 if len(keyTokens) <= nIndexTokens { 1552 return start, nil 1553 } 1554 1555 // len(keyTokens) > nIndexTokens, so this must be a child key. 1556 // Transform /1/#/2 --> /2. 1557 firstNTokenLen := 0 1558 for _, token := range keyTokens[:nIndexTokens] { 1559 firstNTokenLen += len(token) 1560 } 1561 1562 return start[:firstNTokenLen].PrefixEnd(), nil 1563 } 1564 1565 // AdjustEndKeyForInterleave returns an exclusive end key. It does two things: 1566 // - determines the end key based on the prior: inclusive vs exclusive 1567 // - adjusts the end key to skip unnecessary interleaved sections 1568 // 1569 // For example, the parent span composed from the filter PK >= 1 and PK < 3 is 1570 // /1 - /3 1571 // This reads all keys up to the first parent key for PK = 3. If parent had 1572 // interleaved tables and keys, it would unnecessarily scan over interleaved 1573 // rows under PK2 (e.g. /2/#/5). 1574 // We can instead "tighten" or adjust the end key from /3 to /2/#. 1575 // DO NOT pass in any keys that have been invoked with PrefixEnd: this may 1576 // cause issues when trying to decode the key tokens. 1577 // AdjustEndKeyForInterleave is idempotent upon successive invocation(s). 1578 func AdjustEndKeyForInterleave( 1579 codec keys.SQLCodec, 1580 table *TableDescriptor, 1581 index *IndexDescriptor, 1582 end roachpb.Key, 1583 inclusive bool, 1584 ) (roachpb.Key, error) { 1585 if index.Type == IndexDescriptor_INVERTED { 1586 return end.PrefixEnd(), nil 1587 } 1588 1589 // Remove the tenant prefix before decomposing. 1590 strippedEnd, err := codec.StripTenantPrefix(end) 1591 if err != nil { 1592 return roachpb.Key{}, err 1593 } 1594 1595 // To illustrate, suppose we have the interleaved hierarchy 1596 // parent 1597 // child 1598 // grandchild 1599 // Suppose our target index is child. 1600 keyTokens, containsNull, err := encoding.DecomposeKeyTokens(strippedEnd) 1601 if err != nil { 1602 return roachpb.Key{}, err 1603 } 1604 nIndexTokens := maxKeyTokens(index, containsNull) 1605 1606 // Sibling/nibling keys: it is possible for this key to be part 1607 // of a sibling tree in the interleaved hierarchy, especially after 1608 // partitioning on range split keys. 1609 // As such, a sibling may be interpretted as an ancestor (if the sibling 1610 // has fewer key-encoded columns) or a descendant (if the sibling has 1611 // more key-encoded columns). Similarly for niblings. 1612 // This is fine because if the sibling is sorted before or after the 1613 // current index (child in our example), it is not possible for us to 1614 // adjust the sibling key such that we add or remove child (the current 1615 // index's) rows from our span. 1616 1617 if index.ID != table.PrimaryIndex.ID || len(keyTokens) < nIndexTokens { 1618 // Case 1: secondary index, parent key or partial child key: 1619 // Secondary indexes cannot have interleaved rows. 1620 // We cannot adjust or tighten parent keys with respect to a 1621 // child index. 1622 // Partial child keys e.g. /1/#/1 vs /1/#/1/2 cannot have 1623 // interleaved rows. 1624 // Nothing to do besides making the end key exclusive if it was 1625 // initially inclusive. 1626 if inclusive { 1627 end = end.PrefixEnd() 1628 } 1629 return end, nil 1630 } 1631 1632 if len(keyTokens) == nIndexTokens { 1633 // Case 2: child key 1634 1635 lastToken := keyTokens[len(keyTokens)-1] 1636 _, isNotNullDesc := encoding.DecodeIfNotNullDescending(lastToken) 1637 // If this is the child's key and the last value in the key is 1638 // NotNullDesc, then it does not need (read: shouldn't) to be 1639 // tightened. 1640 // For example, the query with IS NOT NULL may generate 1641 // the end key 1642 // /1/#/NOTNULLDESC 1643 if isNotNullDesc { 1644 if inclusive { 1645 end = end.PrefixEnd() 1646 } 1647 return end, nil 1648 } 1649 1650 // We only want to UndoPrefixEnd if the end key passed is not 1651 // inclusive initially. 1652 if !inclusive { 1653 lastType := encoding.PeekType(lastToken) 1654 if lastType == encoding.Bytes || lastType == encoding.BytesDesc || lastType == encoding.Decimal { 1655 // If the last value is of type Decimals or 1656 // Bytes then this is more difficult since the 1657 // escape term is the last value. 1658 // TODO(richardwu): Figure out how to go back 1 1659 // logical bytes/decimal value. 1660 return end, nil 1661 } 1662 1663 // We first iterate back to the previous key value 1664 // /1/#/1 --> /1/#/0 1665 undoPrefixEnd, ok := encoding.UndoPrefixEnd(end) 1666 if !ok { 1667 return end, nil 1668 } 1669 end = undoPrefixEnd 1670 } 1671 1672 // /1/#/0 --> /1/#/0/# 1673 return encoding.EncodeInterleavedSentinel(end), nil 1674 } 1675 1676 // len(keyTokens) > nIndexTokens 1677 // Case 3: tightened child, sibling/nibling, or grandchild key 1678 1679 // Case 3a: tightened child key 1680 // This could from a previous invocation of AdjustEndKeyForInterleave. 1681 // For example, if during index selection the key for child was 1682 // tightened 1683 // /1/#/2 --> /1/#/1/# 1684 // We don't really want to tighten on '#' again. 1685 if _, isSentinel := encoding.DecodeIfInterleavedSentinel(keyTokens[nIndexTokens]); isSentinel && len(keyTokens)-1 == nIndexTokens { 1686 if inclusive { 1687 end = end.PrefixEnd() 1688 } 1689 return end, nil 1690 } 1691 1692 // Case 3b/c: sibling/nibling or grandchild key 1693 // Ideally, we want to form 1694 // /1/#/2/#/3 --> /1/#/2/# 1695 // We truncate up to and including the interleave sentinel (or next 1696 // sibling/nibling column value) after the last index key token. 1697 firstNTokenLen := 0 1698 for _, token := range keyTokens[:nIndexTokens] { 1699 firstNTokenLen += len(token) 1700 } 1701 1702 return end[:firstNTokenLen+1], nil 1703 }