github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/disk_map.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
	"bytes"
	"context"
	"sync/atomic"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
)

// defaultBatchCapacityBytes is the default capacity for a
// SortedDiskMapBatchWriter.
const defaultBatchCapacityBytes = 4096

// rocksDBMapBatchWriter batches writes to a rocksDBMap.
type rocksDBMapBatchWriter struct {
	// capacity is the number of bytes to write before a Flush() is triggered.
	capacity int

	// makeKey is a function that transforms a key into an MVCCKey with a prefix
	// to be written to the underlying store.
	makeKey           func(k []byte) MVCCKey
	batch             Batch
	numPutsSinceFlush int
	store             Engine
}

// rocksDBMapIterator iterates over the keys of a rocksDBMap in sorted order.
type rocksDBMapIterator struct {
	iter Iterator
	// makeKey is a function that transforms a key into an MVCCKey with a prefix
	// used to SeekGE() the underlying iterator.
	makeKey func(k []byte) MVCCKey
	// prefix is the prefix of keys that this iterator iterates over.
	prefix []byte
}

// rocksDBMap is a SortedDiskMap that uses RocksDB as its underlying storage
// engine.
type rocksDBMap struct {
	// TODO(asubiotto): Add memory accounting.
	prefix          []byte
	store           Engine
	allowDuplicates bool
	keyID           int64
}

var _ diskmap.SortedDiskMapBatchWriter = &rocksDBMapBatchWriter{}
var _ diskmap.SortedDiskMapIterator = &rocksDBMapIterator{}
var _ diskmap.SortedDiskMap = &rocksDBMap{}

// tempStorageID is the temp ID generator for a node. It generates unique
// prefixes for newRocksDBMap and newPebbleMap. It is a global because these
// constructors need to prefix their writes uniquely, and using a global
// prevents users from having to specify the prefix themselves and correctly
// guarantee that it is unique.
var tempStorageID uint64

func generateTempStorageID() uint64 {
	return atomic.AddUint64(&tempStorageID, 1)
}

// newRocksDBMap creates a new rocksDBMap with the passed-in Engine as the
// underlying store. The rocksDBMap instance will have a keyspace prefixed by
// a unique prefix. The allowDuplicates parameter controls whether Puts with
// identical keys will write multiple entries or overwrite previous entries.
func newRocksDBMap(e Engine, allowDuplicates bool) *rocksDBMap {
	prefix := generateTempStorageID()
	return &rocksDBMap{
		prefix:          encoding.EncodeUvarintAscending([]byte(nil), prefix),
		store:           e,
		allowDuplicates: allowDuplicates,
	}
}
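
// exampleDiskMapLifecycle is a minimal, hypothetical usage sketch of the
// SortedDiskMap API defined in this file: write through a batch writer, read
// back in sorted order through an iterator, and Close the map to drop its
// keyspace. The Engine is assumed to be a caller-supplied temp-store engine;
// the keys and values are illustrative only.
func exampleDiskMapLifecycle(ctx context.Context, e Engine) error {
	m := newRocksDBMap(e, false /* allowDuplicates */)
	defer m.Close(ctx) // Close clears the map's prefixed keyspace.

	w := m.NewBatchWriter()
	if err := w.Put([]byte("a"), []byte("1")); err != nil {
		return err
	}
	// Closing the writer flushes any puts still buffered in the batch.
	if err := w.Close(ctx); err != nil {
		return err
	}

	it := m.NewIterator()
	defer it.Close()
	for it.Rewind(); ; it.Next() {
		if ok, err := it.Valid(); err != nil {
			return err
		} else if !ok {
			break
		}
		// UnsafeKey/UnsafeValue return slices that are only valid until the
		// next call on the iterator; copy them if they must be retained.
		_ = it.UnsafeKey()
		_ = it.UnsafeValue()
	}
	return nil
}
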
// makeKey appends k to the rocksDBMap's prefix to keep the key local to this
// instance and creates an MVCCKey, which is what the underlying storage
// engine expects. The returned key is only valid until the next call to
// makeKey().
func (r *rocksDBMap) makeKey(k []byte) MVCCKey {
	// TODO(asubiotto): We can make this more performant by bypassing MVCCKey
	// creation (have to generalize storage API). See
	// https://github.com/cockroachdb/cockroach/issues/16718#issuecomment-311493414
	prefixLen := len(r.prefix)
	r.prefix = append(r.prefix, k...)
	mvccKey := MVCCKey{Key: r.prefix}
	r.prefix = r.prefix[:prefixLen]
	return mvccKey
}

// makeKeyWithTimestamp makes a key appropriate for a Put operation. It is
// like makeKey except that it respects allowDuplicates: when duplicates are
// allowed, it uses the MVCC timestamp field to assign a unique keyID so that
// duplicate keys don't overwrite each other.
func (r *rocksDBMap) makeKeyWithTimestamp(k []byte) MVCCKey {
	mvccKey := r.makeKey(k)
	if r.allowDuplicates {
		r.keyID++
		mvccKey.Timestamp.WallTime = r.keyID
	}
	return mvccKey
}

// NewIterator implements the SortedDiskMap interface.
func (r *rocksDBMap) NewIterator() diskmap.SortedDiskMapIterator {
	// NOTE: prefix is only false because we can't use the normal prefix
	// extractor. This iterator still only does prefix iteration. See
	// rocksDBMapIterator.Valid().
	return &rocksDBMapIterator{
		iter: r.store.NewIterator(IterOptions{
			UpperBound: roachpb.Key(r.prefix).PrefixEnd(),
		}),
		makeKey: r.makeKey,
		prefix:  r.prefix,
	}
}

// NewBatchWriter implements the SortedDiskMap interface.
func (r *rocksDBMap) NewBatchWriter() diskmap.SortedDiskMapBatchWriter {
	return r.NewBatchWriterCapacity(defaultBatchCapacityBytes)
}

// NewBatchWriterCapacity implements the SortedDiskMap interface.
func (r *rocksDBMap) NewBatchWriterCapacity(capacityBytes int) diskmap.SortedDiskMapBatchWriter {
	makeKey := r.makeKey
	if r.allowDuplicates {
		makeKey = r.makeKeyWithTimestamp
	}
	return &rocksDBMapBatchWriter{
		capacity: capacityBytes,
		makeKey:  makeKey,
		batch:    r.store.NewWriteOnlyBatch(),
		store:    r.store,
	}
}

// Clear implements the SortedDiskMap interface.
func (r *rocksDBMap) Clear() error {
	if err := r.store.ClearRange(
		MVCCKey{Key: r.prefix},
		MVCCKey{Key: roachpb.Key(r.prefix).PrefixEnd()},
	); err != nil {
		return errors.Wrapf(err, "unable to clear range with prefix %v", r.prefix)
	}
	// NB: we manually flush after performing the clear range to ensure that
	// the range tombstone is pushed to disk, which will kick off compactions
	// that will eventually free up the deleted space.
	return r.store.Flush()
}

// Close implements the SortedDiskMap interface.
func (r *rocksDBMap) Close(ctx context.Context) {
	if err := r.Clear(); err != nil {
		log.Errorf(ctx, "%v", err)
	}
}

// SeekGE implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) SeekGE(k []byte) {
	i.iter.SeekGE(i.makeKey(k))
}

// Rewind implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) Rewind() {
	i.iter.SeekGE(i.makeKey(nil))
}

// Valid implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) Valid() (bool, error) {
	ok, err := i.iter.Valid()
	if err != nil {
		return false, err
	}
	if ok && !bytes.HasPrefix(i.iter.UnsafeKey().Key, i.prefix) {
		return false, nil
	}

	return ok, nil
}
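
// exampleDuplicatePuts is a hypothetical sketch of the allowDuplicates mode
// described above: each Put of an identical user key is assigned a distinct
// MVCC wall-time (the incrementing keyID), so both entries survive and
// iteration visits the user key twice. The Engine is assumed to be
// caller-supplied.
func exampleDuplicatePuts(ctx context.Context, e Engine) error {
	m := newRocksDBMap(e, true /* allowDuplicates */)
	defer m.Close(ctx)

	w := m.NewBatchWriter()
	for _, v := range [][]byte{[]byte("v1"), []byte("v2")} {
		// Same user key both times; makeKeyWithTimestamp disambiguates.
		if err := w.Put([]byte("dup"), v); err != nil {
			return err
		}
	}
	return w.Close(ctx)
}
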
// Next implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) Next() {
	i.iter.Next()
}

// UnsafeKey implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) UnsafeKey() []byte {
	return i.iter.UnsafeKey().Key[len(i.prefix):]
}

// UnsafeValue implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) UnsafeValue() []byte {
	return i.iter.UnsafeValue()
}

// Close implements the SortedDiskMapIterator interface.
func (i *rocksDBMapIterator) Close() {
	i.iter.Close()
}

// Put implements the SortedDiskMapBatchWriter interface.
func (b *rocksDBMapBatchWriter) Put(k []byte, v []byte) error {
	if err := b.batch.Put(b.makeKey(k), v); err != nil {
		return err
	}
	b.numPutsSinceFlush++
	if b.batch.Len() >= b.capacity {
		return b.Flush()
	}
	return nil
}

// Flush implements the SortedDiskMapBatchWriter interface.
func (b *rocksDBMapBatchWriter) Flush() error {
	if b.batch.Empty() {
		return nil
	}
	if err := b.batch.Commit(false /* syncCommit */); err != nil {
		return err
	}
	b.numPutsSinceFlush = 0
	b.batch = b.store.NewWriteOnlyBatch()
	return nil
}

// NumPutsSinceFlush implements the SortedDiskMapBatchWriter interface.
func (b *rocksDBMapBatchWriter) NumPutsSinceFlush() int {
	return b.numPutsSinceFlush
}

// Close implements the SortedDiskMapBatchWriter interface.
func (b *rocksDBMapBatchWriter) Close(ctx context.Context) error {
	err := b.Flush()
	b.batch.Close()
	return err
}

// pebbleMapBatchWriter batches writes to a pebbleMap.
type pebbleMapBatchWriter struct {
	// capacity is the number of bytes to write before a Flush() is triggered.
	capacity int

	// makeKey is a function that transforms a key into a byte slice with a
	// prefix to be written to the underlying store.
	makeKey           func(k []byte) []byte
	batch             *pebble.Batch
	numPutsSinceFlush int
	store             *pebble.DB
}

// pebbleMapIterator iterates over the keys of a pebbleMap in sorted order.
type pebbleMapIterator struct {
	allowDuplicates bool
	iter            *pebble.Iterator
	// makeKey is a function that transforms a key into a byte slice with a
	// prefix used to SeekGE() the underlying iterator.
	makeKey func(k []byte) []byte
	// prefix is the prefix of keys that this iterator iterates over.
	prefix []byte
}

// pebbleMap is a SortedDiskMap, similar to rocksDBMap, that uses pebble as
// its underlying storage engine.
type pebbleMap struct {
	prefix          []byte
	store           *pebble.DB
	allowDuplicates bool
	keyID           int64
}

var _ diskmap.SortedDiskMapBatchWriter = &pebbleMapBatchWriter{}
var _ diskmap.SortedDiskMapIterator = &pebbleMapIterator{}
var _ diskmap.SortedDiskMap = &pebbleMap{}
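
// exampleSequencedKeyLayout is a hypothetical sketch of the byte layout a
// pebbleMap produces when allowDuplicates is true: the map's uvarint prefix,
// then the user key, then a fixed 8-byte ascending sequence number, so equal
// user keys stay distinct and sort in insertion order. The prefix value 1 and
// keyID 7 stand in for a generated tempStorageID and a live counter; the
// function returns the user key recovered from the framed key.
func exampleSequencedKeyLayout() []byte {
	prefix := encoding.EncodeUvarintAscending(nil, 1 /* tempStorageID */)
	key := append(append([]byte(nil), prefix...), "user-key"...)
	key = encoding.EncodeUint64Ascending(key, 7 /* keyID */)
	// pebbleMapIterator.UnsafeKey reverses this framing: it trims
	// len(prefix) bytes from the front and the 8 sequence bytes from the end.
	return key[len(prefix) : len(key)-8]
}
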
// newPebbleMap creates a new pebbleMap with the passed-in pebble.DB as the
// underlying store. The pebbleMap instance will have a keyspace prefixed by
// a unique prefix. The allowDuplicates parameter controls whether Puts with
// identical keys will write multiple entries or overwrite previous entries.
func newPebbleMap(e *pebble.DB, allowDuplicates bool) *pebbleMap {
	prefix := generateTempStorageID()
	return &pebbleMap{
		prefix:          encoding.EncodeUvarintAscending([]byte(nil), prefix),
		store:           e,
		allowDuplicates: allowDuplicates,
	}
}

// makeKey appends k to the pebbleMap's prefix to keep the key local to this
// instance and returns a byte slice containing the user-provided key and the
// prefix. Pebble's operations can take this byte slice as a key. The key is
// only valid until the next call to makeKey().
func (r *pebbleMap) makeKey(k []byte) []byte {
	prefixLen := len(r.prefix)
	r.prefix = append(r.prefix, k...)
	key := r.prefix
	r.prefix = r.prefix[:prefixLen]
	return key
}

// makeKeyWithSequence makes a key appropriate for a Put operation. It is like
// makeKey except that it respects allowDuplicates by appending a sequence
// number to the user-provided key.
func (r *pebbleMap) makeKeyWithSequence(k []byte) []byte {
	byteKey := r.makeKey(k)
	if r.allowDuplicates {
		r.keyID++
		byteKey = encoding.EncodeUint64Ascending(byteKey, uint64(r.keyID))
	}
	return byteKey
}

// NewIterator implements the SortedDiskMap interface.
func (r *pebbleMap) NewIterator() diskmap.SortedDiskMapIterator {
	return &pebbleMapIterator{
		allowDuplicates: r.allowDuplicates,
		iter: r.store.NewIter(&pebble.IterOptions{
			UpperBound: roachpb.Key(r.prefix).PrefixEnd(),
		}),
		makeKey: r.makeKey,
		prefix:  r.prefix,
	}
}

// NewBatchWriter implements the SortedDiskMap interface.
func (r *pebbleMap) NewBatchWriter() diskmap.SortedDiskMapBatchWriter {
	return r.NewBatchWriterCapacity(defaultBatchCapacityBytes)
}

// NewBatchWriterCapacity implements the SortedDiskMap interface.
func (r *pebbleMap) NewBatchWriterCapacity(capacityBytes int) diskmap.SortedDiskMapBatchWriter {
	makeKey := r.makeKey
	if r.allowDuplicates {
		makeKey = r.makeKeyWithSequence
	}
	return &pebbleMapBatchWriter{
		capacity: capacityBytes,
		makeKey:  makeKey,
		batch:    r.store.NewBatch(),
		store:    r.store,
	}
}

// Clear implements the SortedDiskMap interface.
func (r *pebbleMap) Clear() error {
	if err := r.store.DeleteRange(
		r.prefix,
		roachpb.Key(r.prefix).PrefixEnd(),
		pebble.NoSync,
	); err != nil {
		return errors.Wrapf(err, "unable to clear range with prefix %v", r.prefix)
	}
	// NB: we manually flush after performing the clear range to ensure that
	// the range tombstone is pushed to disk, which will kick off compactions
	// that will eventually free up the deleted space.
	_, err := r.store.AsyncFlush()
	return err
}

// Close implements the SortedDiskMap interface.
func (r *pebbleMap) Close(ctx context.Context) {
	if err := r.Clear(); err != nil {
		log.Errorf(ctx, "%v", err)
	}
}

// SeekGE implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) SeekGE(k []byte) {
	i.iter.SeekGE(i.makeKey(k))
}

// Rewind implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Rewind() {
	i.iter.SeekGE(i.makeKey(nil))
}

// Valid implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Valid() (bool, error) {
	return i.iter.Valid(), nil
}
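
// examplePebbleSeek is a hypothetical sketch of ranged reads through the
// iterator: SeekGE positions at the first user key >= the argument within
// this map's keyspace, and the UpperBound configured in NewIterator keeps
// iteration from escaping the prefix. The map is assumed to have been
// populated already.
func examplePebbleSeek(m *pebbleMap) error {
	it := m.NewIterator()
	defer it.Close()
	for it.SeekGE([]byte("b")); ; it.Next() {
		if ok, err := it.Valid(); err != nil {
			return err
		} else if !ok {
			break
		}
		_ = it.UnsafeValue() // valid only until the next iterator call
	}
	return nil
}
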
// Next implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Next() {
	i.iter.Next()
}

// UnsafeKey implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) UnsafeKey() []byte {
	unsafeKey := i.iter.Key()
	end := len(unsafeKey)
	if i.allowDuplicates {
		// There are 8 bytes of sequence number at the end of the key; remove
		// them.
		end -= 8
	}
	return unsafeKey[len(i.prefix):end]
}

// UnsafeValue implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) UnsafeValue() []byte {
	return i.iter.Value()
}

// Close implements the SortedDiskMapIterator interface.
func (i *pebbleMapIterator) Close() {
	_ = i.iter.Close()
}

// Put implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Put(k []byte, v []byte) error {
	key := b.makeKey(k)
	if err := b.batch.Set(key, v, nil); err != nil {
		return err
	}
	b.numPutsSinceFlush++
	if len(b.batch.Repr()) >= b.capacity {
		return b.Flush()
	}
	return nil
}

// Flush implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Flush() error {
	if err := b.batch.Commit(pebble.NoSync); err != nil {
		return err
	}
	b.numPutsSinceFlush = 0
	b.batch = b.store.NewBatch()
	return nil
}

// NumPutsSinceFlush implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) NumPutsSinceFlush() int {
	return b.numPutsSinceFlush
}

// Close implements the SortedDiskMapBatchWriter interface.
func (b *pebbleMapBatchWriter) Close(ctx context.Context) error {
	err := b.Flush()
	if err != nil {
		return err
	}
	return b.batch.Close()
}
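
// exampleEagerFlush is a hypothetical sketch of the capacity-triggered flush
// in Put: with a 1-byte capacity every Put pushes the batch's in-memory
// representation past the limit, so each Put commits immediately and
// NumPutsSinceFlush drops back to zero without an explicit Flush call.
func exampleEagerFlush(ctx context.Context, m *pebbleMap) error {
	w := m.NewBatchWriterCapacity(1 /* capacityBytes */)
	if err := w.Put([]byte("k"), []byte("v")); err != nil {
		return err // a failed implicit Flush surfaces here
	}
	if n := w.NumPutsSinceFlush(); n != 0 {
		return errors.Errorf("expected an implicit flush, got %d pending puts", n)
	}
	return w.Close(ctx)
}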