github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowcontainer/disk_row_container.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package rowcontainer

import (
	"bytes"
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/errors"
)

// DiskRowContainer is a SortableRowContainer that stores rows on disk according
// to the ordering specified in DiskRowContainer.ordering. The underlying store
// is a SortedDiskMap so the sorting itself is delegated. Use an iterator
// created through NewIterator() to read the rows in sorted order.
type DiskRowContainer struct {
	diskMap diskmap.SortedDiskMap
	// diskAcc keeps track of disk usage.
	diskAcc mon.BoundAccount
	// bufferedRows buffers writes to the diskMap.
	bufferedRows  diskmap.SortedDiskMapBatchWriter
	scratchKey    []byte
	scratchVal    []byte
	scratchEncRow sqlbase.EncDatumRow

	// For computing mean encoded row bytes.
	totalEncodedRowBytes uint64

	// lastReadKey is used to implement NewFinalIterator. Refer to the method's
	// comment for more information.
	lastReadKey []byte

	// topK is set by callers through InitTopK. Since rows are kept in sorted
	// order, topK will simply limit iterators to read the first k rows.
	topK int

	// rowID is used as a key suffix to prevent duplicate rows from overwriting
	// each other.
	rowID uint64

	// types is the schema of rows in the container.
	types []*types.T
	// ordering is the order in which rows should be sorted.
	ordering sqlbase.ColumnOrdering
	// encodings keeps around the DatumEncoding equivalents of the encoding
	// directions in ordering to avoid conversions in hot paths.
	encodings []sqlbase.DatumEncoding
	// valueIdxs holds the indexes of the columns that we encode as values. The
	// columns described by ordering will be encoded as keys. See
	// MakeDiskRowContainer() for more encoding specifics.
	valueIdxs []int

	// See comment in DoDeDuplicate().
	deDuplicate bool
	// A mapping from a key to the dense row index assigned to the key. It
	// contains all the key strings that are potentially buffered in
	// bufferedRows. Since we need to de-duplicate for every insert attempt, we
	// don't want to keep flushing bufferedRows after every insert.
	// There is currently no memory-accounting for the deDupCache, just like
	// there is none for the bufferedRows. Both will be approximately the same
	// size.
	deDupCache map[string]int

	diskMonitor *mon.BytesMonitor
	engine      diskmap.Factory

	datumAlloc *sqlbase.DatumAlloc
}

var _ SortableRowContainer = &DiskRowContainer{}
var _ DeDupingRowContainer = &DiskRowContainer{}
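// exampleSortedScan is an illustrative sketch, not part of the original file:
// it shows the write-then-read flow described in the DiskRowContainer comment
// above. The monitor, schema, ordering, engine, and rows are assumed to be
// supplied by the caller (as they are in this package's tests).
func exampleSortedScan(
	ctx context.Context,
	monitor *mon.BytesMonitor,
	typs []*types.T,
	ordering sqlbase.ColumnOrdering,
	e diskmap.Factory,
	rows []sqlbase.EncDatumRow,
) error {
	d := MakeDiskRowContainer(monitor, typs, ordering, e)
	defer d.Close(ctx)
	for _, row := range rows {
		if err := d.AddRow(ctx, row); err != nil {
			return err
		}
	}
	// Sort is a no-op: the SortedDiskMap keeps keys (and therefore rows)
	// sorted as they are added.
	d.Sort(ctx)
	i := d.NewIterator(ctx)
	defer i.Close()
	for i.Rewind(); ; i.Next() {
		if ok, err := i.Valid(); err != nil {
			return err
		} else if !ok {
			break
		}
		row, err := i.Row()
		if err != nil {
			return err
		}
		// The returned row is only valid until the next call to Row().
		_ = row
	}
	return nil
}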
// MakeDiskRowContainer creates a DiskRowContainer with the given engine as the
// underlying store that rows are stored on.
// Arguments:
//  - diskMonitor is used to monitor this DiskRowContainer's disk usage.
//  - types is the schema of rows that will be added to this container.
//  - ordering is the output ordering; the order in which rows should be sorted.
//  - e is the underlying store that rows are stored on.
func MakeDiskRowContainer(
	diskMonitor *mon.BytesMonitor,
	types []*types.T,
	ordering sqlbase.ColumnOrdering,
	e diskmap.Factory,
) DiskRowContainer {
	diskMap := e.NewSortedDiskMap()
	d := DiskRowContainer{
		diskMap:       diskMap,
		diskAcc:       diskMonitor.MakeBoundAccount(),
		types:         types,
		ordering:      ordering,
		scratchEncRow: make(sqlbase.EncDatumRow, len(types)),
		diskMonitor:   diskMonitor,
		engine:        e,
		datumAlloc:    &sqlbase.DatumAlloc{},
	}
	d.bufferedRows = d.diskMap.NewBatchWriter()

	// The ordering is specified for a subset of the columns. These will be
	// encoded as a key in the given order according to the given direction so
	// that the sorting can be delegated to the underlying SortedDiskMap. To
	// avoid converting encoding.Direction to sqlbase.DatumEncoding we do this
	// once at initialization and store the conversions in d.encodings.
	// We encode the other columns as values. The indexes of these columns are
	// kept around in d.valueIdxs to have them ready in hot paths.
	// For composite columns that are specified in d.ordering, the Datum is
	// encoded both in the key for comparison and in the value for decoding.
	orderingIdxs := make(map[int]struct{})
	for _, orderInfo := range d.ordering {
		orderingIdxs[orderInfo.ColIdx] = struct{}{}
	}
	d.valueIdxs = make([]int, 0, len(d.types))
	for i := range d.types {
		// TODO(asubiotto): A datum of a type for which HasCompositeKeyEncoding
		// returns true may not necessarily need to be encoded in the value, so
		// make this more fine-grained. See IsComposite() methods in
		// pkg/sql/parser/datum.go.
		if _, ok := orderingIdxs[i]; !ok || sqlbase.HasCompositeKeyEncoding(d.types[i]) {
			d.valueIdxs = append(d.valueIdxs, i)
		}
	}

	d.encodings = make([]sqlbase.DatumEncoding, len(d.ordering))
	for i, orderInfo := range ordering {
		d.encodings[i] = sqlbase.EncodingDirToDatumEncoding(orderInfo.Direction)
	}

	return d
}
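// To make the encoding scheme above concrete: for a hypothetical schema of
// (INT, STRING) with an ascending ordering on column 0, a row is stored as
// (illustrative sketch, not an exact byte layout):
//
//	key   = <col0: ascending key encoding> <rowID: uvarint>
//	value = <col1: value encoding>
//
// Keys compare by column 0 first, so the SortedDiskMap keeps rows in the
// requested order; the trailing rowID (appended in encodeRow) distinguishes
// duplicate rows.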
// DoDeDuplicate causes DiskRowContainer to behave as an implementation of
// DeDupingRowContainer. It should not be mixed with calls to AddRow() (except
// when the AddRow() calls already represent de-duplicated rows). It
// de-duplicates the keys such that only the first row with the given key will
// be stored. The index returned by AddRowWithDeDup() is a dense index starting
// from 0, representing when that key was first added. This feature does not
// combine with Sort(), Reorder() etc., and is only to be used for the
// assignment of these dense indexes. The main reason to add this to
// DiskBackedRowContainer is to avoid significant code duplication in
// constructing another row container.
func (d *DiskRowContainer) DoDeDuplicate() {
	d.deDuplicate = true
	d.deDupCache = make(map[string]int)
}

// Len is part of the SortableRowContainer interface.
func (d *DiskRowContainer) Len() int {
	return int(d.rowID)
}

// AddRow is part of the SortableRowContainer interface.
//
// It is additionally used in de-duping mode by DiskBackedRowContainer when
// switching from a memory container to this disk container, since it is
// adding rows that are already de-duped. Once it has added all the already
// de-duped rows, it should switch to using AddRowWithDeDup() and never call
// AddRow() again.
//
// Note: if key calculation changes, computeKey() of hashMemRowIterator should
// be changed accordingly.
func (d *DiskRowContainer) AddRow(ctx context.Context, row sqlbase.EncDatumRow) error {
	if err := d.encodeRow(ctx, row); err != nil {
		return err
	}
	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
		return pgerror.Wrapf(err, pgcode.OutOfMemory,
			"this query requires additional disk space")
	}
	if err := d.bufferedRows.Put(d.scratchKey, d.scratchVal); err != nil {
		return err
	}
	// See the comment above on when this is used for already de-duplicated
	// rows -- we need to track these in the de-dup cache so that later
	// calls to AddRowWithDeDup() de-duplicate with respect to this cache.
	if d.deDuplicate {
		if d.bufferedRows.NumPutsSinceFlush() == 0 {
			d.clearDeDupCache()
		} else {
			d.deDupCache[string(d.scratchKey)] = int(d.rowID)
		}
	}
	d.totalEncodedRowBytes += uint64(len(d.scratchKey) + len(d.scratchVal))
	d.scratchKey = d.scratchKey[:0]
	d.scratchVal = d.scratchVal[:0]
	d.rowID++
	return nil
}

// AddRowWithDeDup is part of the DeDupingRowContainer interface.
func (d *DiskRowContainer) AddRowWithDeDup(
	ctx context.Context, row sqlbase.EncDatumRow,
) (int, error) {
	if err := d.encodeRow(ctx, row); err != nil {
		return 0, err
	}
	defer func() {
		d.scratchKey = d.scratchKey[:0]
		d.scratchVal = d.scratchVal[:0]
	}()
	// First use the cache to de-dup.
	entry, ok := d.deDupCache[string(d.scratchKey)]
	if ok {
		return entry, nil
	}
	// The key is not in the cache, so we need to use an iterator to de-dup.
	// TODO(sumeer): this read is expensive:
	// - if there is a significant fraction of duplicates, we can do better
	//   with a larger cache
	// - if duplicates are rare, use a bloom filter for all the keys in the
	//   diskMap, since a miss in the bloom filter allows us to write to the
	//   diskMap without reading.
	iter := d.diskMap.NewIterator()
	defer iter.Close()
	iter.SeekGE(d.scratchKey)
	valid, err := iter.Valid()
	if err != nil {
		return 0, err
	}
	if valid && bytes.Equal(iter.UnsafeKey(), d.scratchKey) {
		// Found the key. Note that as documented in DeDupingRowContainer,
		// this feature is limited to the case where the whole row is
		// encoded into the key. The value only contains the dense RowID
		// assigned to the key.
		_, idx, err := encoding.DecodeUvarintAscending(iter.UnsafeValue())
		if err != nil {
			return 0, err
		}
		return int(idx), nil
	}
	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
		return 0, pgerror.Wrapf(err, pgcode.OutOfMemory,
			"this query requires additional disk space")
	}
	if err := d.bufferedRows.Put(d.scratchKey, d.scratchVal); err != nil {
		return 0, err
	}
	if d.bufferedRows.NumPutsSinceFlush() == 0 {
		d.clearDeDupCache()
	} else {
		d.deDupCache[string(d.scratchKey)] = int(d.rowID)
	}
	d.totalEncodedRowBytes += uint64(len(d.scratchKey) + len(d.scratchVal))
	idx := int(d.rowID)
	d.rowID++
	return idx, nil
}
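// exampleDenseIndexAssignment is an illustrative sketch, not part of the
// original file: it shows the DeDupingRowContainer usage described in
// DoDeDuplicate() above, where equal rows map to the same dense index. It
// assumes the ordering covers all columns, so that the whole row is encoded
// into the key as AddRowWithDeDup() requires; setup arguments and rows are
// assumed to come from the caller.
func exampleDenseIndexAssignment(
	ctx context.Context,
	monitor *mon.BytesMonitor,
	typs []*types.T,
	ordering sqlbase.ColumnOrdering,
	e diskmap.Factory,
	rows []sqlbase.EncDatumRow,
) ([]int, error) {
	d := MakeDiskRowContainer(monitor, typs, ordering, e)
	defer d.Close(ctx)
	d.DoDeDuplicate()
	// indexes[i] is the dense index assigned to rows[i]; duplicate rows
	// receive the index assigned when the key was first added.
	indexes := make([]int, len(rows))
	for i, row := range rows {
		idx, err := d.AddRowWithDeDup(ctx, row)
		if err != nil {
			return nil, err
		}
		indexes[i] = idx
	}
	return indexes, nil
}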
func (d *DiskRowContainer) clearDeDupCache() {
	for k := range d.deDupCache {
		delete(d.deDupCache, k)
	}
}

func (d *DiskRowContainer) testingFlushBuffer(ctx context.Context) {
	if err := d.bufferedRows.Flush(); err != nil {
		log.Fatalf(ctx, "%v", err)
	}
	d.clearDeDupCache()
}

func (d *DiskRowContainer) encodeRow(ctx context.Context, row sqlbase.EncDatumRow) error {
	if len(row) != len(d.types) {
		log.Fatalf(ctx, "invalid row length %d, expected %d", len(row), len(d.types))
	}

	for i, orderInfo := range d.ordering {
		col := orderInfo.ColIdx
		var err error
		d.scratchKey, err = row[col].Encode(d.types[col], d.datumAlloc, d.encodings[i], d.scratchKey)
		if err != nil {
			return err
		}
	}
	if !d.deDuplicate {
		for _, i := range d.valueIdxs {
			var err error
			d.scratchVal, err = row[i].Encode(d.types[i], d.datumAlloc, sqlbase.DatumEncoding_VALUE, d.scratchVal)
			if err != nil {
				return err
			}
		}
		// Append a unique rowID suffix to the key so that duplicate rows do
		// not overwrite each other. Note that this will not mess with key
		// decoding.
		d.scratchKey = encoding.EncodeUvarintAscending(d.scratchKey, d.rowID)
	} else {
		// Add the rowID to the value. Note that in this de-duping case the
		// rowID is the only thing in the value since the whole row is encoded
		// into the key. Note that the key could have types for which
		// HasCompositeKeyEncoding() returns true and we do not encode them
		// into the value (only in the key) for this DeDupingRowContainer. This
		// is ok since:
		// - The DeDupingRowContainer never needs to return the original row
		//   (there is no get method).
		// - The columns encoded into the key are the primary key columns
		//   of the original table, so the key encoding represents a unique
		//   row in the original table (the key encoding here is not only
		//   a determinant of sort ordering).
		d.scratchVal = encoding.EncodeUvarintAscending(d.scratchVal, d.rowID)
	}
	return nil
}
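// exampleRowIDSuffix is an illustrative sketch, not part of the original
// file: it shows why the rowID suffix appended in encodeRow() keeps duplicate
// rows distinct. Two identical ordering-column prefixes produce different
// keys once distinct uvarint rowIDs are appended, and the suffix can be
// decoded back out (keyValToRow() simply never reads past the prefix).
func exampleRowIDSuffix(prefix []byte) error {
	key0 := encoding.EncodeUvarintAscending(append([]byte(nil), prefix...), 0)
	key1 := encoding.EncodeUvarintAscending(append([]byte(nil), prefix...), 1)
	if bytes.Equal(key0, key1) {
		return errors.AssertionFailedf("duplicate keys despite rowID suffix")
	}
	// Skipping the prefix leaves the trailing uvarint, which yields the rowID.
	_, rowID, err := encoding.DecodeUvarintAscending(key1[len(prefix):])
	if err != nil {
		return err
	}
	_ = rowID // rowID == 1 here.
	return nil
}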
// Sort is a no-op because the use of a SortedDiskMap as the underlying store
// keeps the rows in sorted order.
func (d *DiskRowContainer) Sort(context.Context) {}

// Reorder implements ReorderableRowContainer. It creates a new
// DiskRowContainer with the requested ordering, adds the rows from the
// current DiskRowContainer one by one, and closes the current container at
// the end.
func (d *DiskRowContainer) Reorder(ctx context.Context, ordering sqlbase.ColumnOrdering) error {
	// We need to create a new DiskRowContainer since its ordering can only be
	// changed at initialization.
	newContainer := MakeDiskRowContainer(d.diskMonitor, d.types, ordering, d.engine)
	i := d.NewFinalIterator(ctx)
	defer i.Close()
	for i.Rewind(); ; i.Next() {
		if ok, err := i.Valid(); err != nil {
			return err
		} else if !ok {
			break
		}
		row, err := i.Row()
		if err != nil {
			return err
		}
		if err := newContainer.AddRow(ctx, row); err != nil {
			return err
		}
	}
	d.Close(ctx)
	*d = newContainer
	return nil
}

// InitTopK limits iterators to read the first k rows.
func (d *DiskRowContainer) InitTopK() {
	d.topK = d.Len()
}

// MaybeReplaceMax adds row to the DiskRowContainer. The SortedDiskMap will
// sort this row into the top k if applicable.
func (d *DiskRowContainer) MaybeReplaceMax(ctx context.Context, row sqlbase.EncDatumRow) error {
	return d.AddRow(ctx, row)
}

// MeanEncodedRowBytes returns the mean number of bytes consumed by an encoded
// row stored in this container.
func (d *DiskRowContainer) MeanEncodedRowBytes() int {
	if d.rowID == 0 {
		return 0
	}
	return int(d.totalEncodedRowBytes / d.rowID)
}

// UnsafeReset is part of the SortableRowContainer interface.
func (d *DiskRowContainer) UnsafeReset(ctx context.Context) error {
	_ = d.bufferedRows.Close(ctx)
	if err := d.diskMap.Clear(); err != nil {
		return err
	}
	d.diskAcc.Clear(ctx)
	d.bufferedRows = d.diskMap.NewBatchWriter()
	d.clearDeDupCache()
	d.lastReadKey = nil
	d.rowID = 0
	d.totalEncodedRowBytes = 0
	return nil
}

// Close is part of the SortableRowContainer interface.
func (d *DiskRowContainer) Close(ctx context.Context) {
	// We can ignore the error here because the flushed data is immediately
	// discarded by the following diskMap.Close.
	_ = d.bufferedRows.Close(ctx)
	d.diskMap.Close(ctx)
	d.diskAcc.Close(ctx)
}

// keyValToRow decodes a key and a value byte slice stored with AddRow() into
// a sqlbase.EncDatumRow. The returned EncDatumRow is only valid until the next
// call to keyValToRow().
func (d *DiskRowContainer) keyValToRow(k []byte, v []byte) (sqlbase.EncDatumRow, error) {
	for i, orderInfo := range d.ordering {
		// Types with composite key encodings are decoded from the value.
		if sqlbase.HasCompositeKeyEncoding(d.types[orderInfo.ColIdx]) {
			// Skip over the encoded key.
			encLen, err := encoding.PeekLength(k)
			if err != nil {
				return nil, err
			}
			k = k[encLen:]
			continue
		}
		var err error
		col := orderInfo.ColIdx
		d.scratchEncRow[col], k, err = sqlbase.EncDatumFromBuffer(d.types[col], d.encodings[i], k)
		if err != nil {
			return nil, errors.NewAssertionErrorWithWrappedErrf(err,
				"unable to decode row, column idx %d", errors.Safe(col))
		}
	}
	for _, i := range d.valueIdxs {
		var err error
		d.scratchEncRow[i], v, err = sqlbase.EncDatumFromBuffer(d.types[i], sqlbase.DatumEncoding_VALUE, v)
		if err != nil {
			return nil, errors.NewAssertionErrorWithWrappedErrf(err,
				"unable to decode row, value idx %d", errors.Safe(i))
		}
	}
	return d.scratchEncRow, nil
}
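// exampleTopK is an illustrative sketch, not part of the original file: it
// shows the top-k pattern that InitTopK() and MaybeReplaceMax() above are
// designed for. After the first k rows are added, InitTopK() freezes k;
// MaybeReplaceMax() keeps adding candidates, and because the SortedDiskMap
// keeps everything sorted, iterators then return only the first k rows in
// the ordering. Setup arguments and row batches are assumed from the caller.
func exampleTopK(
	ctx context.Context,
	monitor *mon.BytesMonitor,
	typs []*types.T,
	ordering sqlbase.ColumnOrdering,
	e diskmap.Factory,
	firstK []sqlbase.EncDatumRow,
	rest []sqlbase.EncDatumRow,
) error {
	d := MakeDiskRowContainer(monitor, typs, ordering, e)
	defer d.Close(ctx)
	for _, row := range firstK {
		if err := d.AddRow(ctx, row); err != nil {
			return err
		}
	}
	d.InitTopK()
	for _, row := range rest {
		if err := d.MaybeReplaceMax(ctx, row); err != nil {
			return err
		}
	}
	// NewIterator now yields at most k rows: the smallest in the ordering.
	i := d.NewIterator(ctx)
	defer i.Close()
	for i.Rewind(); ; i.Next() {
		if ok, err := i.Valid(); err != nil {
			return err
		} else if !ok {
			break
		}
		if _, err := i.Row(); err != nil {
			return err
		}
	}
	return nil
}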
// diskRowIterator iterates over the rows in a DiskRowContainer.
type diskRowIterator struct {
	rowContainer *DiskRowContainer
	rowBuf       []byte
	diskmap.SortedDiskMapIterator
}

var _ RowIterator = &diskRowIterator{}

func (d *DiskRowContainer) newIterator(ctx context.Context) diskRowIterator {
	if err := d.bufferedRows.Flush(); err != nil {
		log.Fatalf(ctx, "%v", err)
	}
	return diskRowIterator{rowContainer: d, SortedDiskMapIterator: d.diskMap.NewIterator()}
}

// NewIterator is part of the SortableRowContainer interface.
func (d *DiskRowContainer) NewIterator(ctx context.Context) RowIterator {
	i := d.newIterator(ctx)
	if d.topK > 0 {
		return &diskRowTopKIterator{RowIterator: &i, k: d.topK}
	}
	return &i
}

// Row returns the current row. The returned sqlbase.EncDatumRow is only valid
// until the next call to Row().
func (r *diskRowIterator) Row() (sqlbase.EncDatumRow, error) {
	if ok, err := r.Valid(); err != nil {
		return nil, errors.NewAssertionErrorWithWrappedErrf(err, "unable to check row validity")
	} else if !ok {
		return nil, errors.AssertionFailedf("invalid row")
	}

	k := r.UnsafeKey()
	v := r.UnsafeValue()
	// TODO(asubiotto): the "true ||" should not be necessary. We should be
	// able to reuse rowBuf, yet doing so causes
	// TestDiskBackedIndexedRowContainer/ReorderingOnDisk, TestHashJoiner, and
	// TestSorter to fail. Some caller of Row() is presumably not making a copy
	// of the return value.
	if true || cap(r.rowBuf) < len(k)+len(v) {
		r.rowBuf = make([]byte, 0, len(k)+len(v))
	}
	r.rowBuf = r.rowBuf[:len(k)+len(v)]
	copy(r.rowBuf, k)
	copy(r.rowBuf[len(k):], v)
	k = r.rowBuf[:len(k)]
	v = r.rowBuf[len(k):]

	return r.rowContainer.keyValToRow(k, v)
}

func (r *diskRowIterator) Close() {
	if r.SortedDiskMapIterator != nil {
		r.SortedDiskMapIterator.Close()
	}
}
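// Callers that retain rows across calls to Row() must copy them first, since
// the container's scratch row is reused by keyValToRow(). A minimal sketch of
// a retaining caller, assuming the sqlbase.EncDatumRowAlloc copier used by
// other row containers (variable names here are illustrative):
//
//	var alloc sqlbase.EncDatumRowAlloc
//	row, err := it.Row()
//	if err != nil {
//		return err
//	}
//	retained = append(retained, alloc.CopyRow(row))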
// numberedRowIterator is a specialization of diskRowIterator that is only for
// the case where the key is the rowID assigned in AddRow().
type numberedRowIterator struct {
	*diskRowIterator
	scratchKey []byte
}

func (d *DiskRowContainer) newNumberedIterator(ctx context.Context) *numberedRowIterator {
	i := d.newIterator(ctx)
	return &numberedRowIterator{diskRowIterator: &i}
}

func (n numberedRowIterator) seekToIndex(idx int) {
	n.scratchKey = encoding.EncodeUvarintAscending(n.scratchKey, uint64(idx))
	n.SeekGE(n.scratchKey)
}

type diskRowFinalIterator struct {
	diskRowIterator
}

var _ RowIterator = &diskRowFinalIterator{}

// NewFinalIterator returns an iterator that reads rows exactly once throughout
// the lifetime of a DiskRowContainer. Rows are not actually discarded from the
// DiskRowContainer, but lastReadKey is tracked so that it can serve as the
// start key for future diskRowFinalIterators.
// NOTE: Don't use NewFinalIterator if you passed in an ordering for the rows
// and will be adding rows between iterations. New rows could sort before the
// current row.
func (d *DiskRowContainer) NewFinalIterator(ctx context.Context) RowIterator {
	i := diskRowFinalIterator{diskRowIterator: d.newIterator(ctx)}
	if d.topK > 0 {
		return &diskRowTopKIterator{RowIterator: &i, k: d.topK}
	}
	return &i
}

func (r *diskRowFinalIterator) Rewind() {
	r.SeekGE(r.diskRowIterator.rowContainer.lastReadKey)
	if r.diskRowIterator.rowContainer.lastReadKey != nil {
		r.Next()
	}
}

func (r *diskRowFinalIterator) Row() (sqlbase.EncDatumRow, error) {
	row, err := r.diskRowIterator.Row()
	if err != nil {
		return nil, err
	}
	r.diskRowIterator.rowContainer.lastReadKey =
		append(r.diskRowIterator.rowContainer.lastReadKey[:0], r.UnsafeKey()...)
	return row, nil
}

type diskRowTopKIterator struct {
	RowIterator
	position int
	// k is the limit of rows to read.
	k int
}

var _ RowIterator = &diskRowTopKIterator{}

func (d *diskRowTopKIterator) Rewind() {
	d.RowIterator.Rewind()
	d.position = 0
}

func (d *diskRowTopKIterator) Valid() (bool, error) {
	if d.position >= d.k {
		return false, nil
	}
	return d.RowIterator.Valid()
}

func (d *diskRowTopKIterator) Next() {
	d.position++
	d.RowIterator.Next()
}
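// exampleDrainInBatches is an illustrative sketch, not part of the original
// file: it shows the NewFinalIterator() contract above, where each successive
// iterator resumes after lastReadKey, so the rows of the container are
// observed exactly once across iterators. The batchSize and emit callback are
// assumed to come from the caller.
func exampleDrainInBatches(
	ctx context.Context,
	d *DiskRowContainer,
	batchSize int,
	emit func(sqlbase.EncDatumRow) error,
) error {
	for {
		i := d.NewFinalIterator(ctx)
		n := 0
		for i.Rewind(); n < batchSize; i.Next() {
			if ok, err := i.Valid(); err != nil {
				i.Close()
				return err
			} else if !ok {
				break
			}
			row, err := i.Row()
			if err != nil {
				i.Close()
				return err
			}
			if err := emit(row); err != nil {
				i.Close()
				return err
			}
			n++
		}
		i.Close()
		if n == 0 {
			// A fresh final iterator found no rows past lastReadKey, so the
			// container has been fully drained.
			return nil
		}
	}
}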